Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)
Merge branch 'v4' into feature/docwise-generator-batching

# Conflicts:
#   spacy/kb/kb.pyx
#   spacy/kb/kb_in_memory.pyx
#   spacy/ml/models/entity_linker.py
#   spacy/pipeline/entity_linker.py
#   spacy/tests/pipeline/test_entity_linker.py
#   website/docs/api/entitylinker.mdx

Commit: 8aa59c4f65

.github/azure-steps.yml (vendored): 129 deletions

@@ -1,129 +0,0 @@
-parameters:
-  python_version: ''
-  architecture: 'x64'
-  num_build_jobs: 2
-
-steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: ${{ parameters.python_version }}
-      architecture: ${{ parameters.architecture }}
-      allowUnstable: true
-
-  - bash: |
-      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
-    displayName: 'Set variables'
-
-  - script: |
-      python -m pip install -U build pip setuptools
-      python -m pip install -U -r requirements.txt
-    displayName: "Install dependencies"
-
-  - script: |
-      python -m build --sdist
-    displayName: "Build sdist"
-
-  - script: |
-      python -m mypy spacy
-    displayName: 'Run mypy'
-    condition: ne(variables['python_version'], '3.6')
-
-  - task: DeleteFiles@1
-    inputs:
-      contents: "spacy"
-    displayName: "Delete source directory"
-
-  - task: DeleteFiles@1
-    inputs:
-      contents: "*.egg-info"
-    displayName: "Delete egg-info directory"
-
-  - script: |
-      python -m pip freeze > installed.txt
-      python -m pip uninstall -y -r installed.txt
-    displayName: "Uninstall all packages"
-
-  - bash: |
-      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
-    displayName: "Install from sdist"
-
-  - script: |
-      python -W error -c "import spacy"
-    displayName: "Test import"
-
-#  - script: |
-#      python -m spacy download ca_core_news_sm
-#      python -m spacy download ca_core_news_md
-#      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-#    displayName: 'Test download CLI'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-#    displayName: 'Test no warnings on load (#11713)'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
-#    displayName: 'Test skip re-download (#12188)'
-#    condition: eq(variables['python_version'], '3.8')
-
-#  - script: |
-#      python -W error -m spacy info ca_core_news_sm | grep -q download_url
-#    displayName: 'Test download_url in info CLI'
-#    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
-    displayName: 'Test convert CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m spacy init config -p ner -l ca ner.cfg
-      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
-    displayName: 'Test debug config CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      # will have errors due to sparse data, check for summary in output
-      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
-    displayName: 'Test debug data CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
-    displayName: 'Test train CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-#  - script: |
-#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-#      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-#    displayName: 'Test assemble CLI'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-#      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-#    displayName: 'Test assemble CLI vectors warning'
-#    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      python -m pytest --pyargs spacy -W error
-    displayName: "Run CPU tests"
-
-  - script: |
-      python -m pip install 'spacy[apple]'
-      python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
-
-  - script: |
-      python .github/validate_universe_json.py website/meta/universe.json
-    displayName: 'Test website/meta/universe.json'
-    condition: eq(variables['python_version'], '3.8')
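Taken together, the steps deleted above implement an sdist round-trip check: build a source distribution, strip the environment and the source tree, reinstall from the archive, and import with warnings treated as errors. A minimal local sketch using the same commands (assuming they run from the repository root with the build dependencies installed):

    # Build the sdist, uninstall everything, reinstall from the archive,
    # then verify the installed package imports without warnings:
    python -m build --sdist
    python -m pip freeze > installed.txt
    python -m pip uninstall -y -r installed.txt
    SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
    SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
    python -W error -c "import spacy"

The equivalent steps reappear in the new GitHub Actions workflow (.github/workflows/tests.yml) further down.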
							
								
								
									
.github/workflows/autoblack.yml (vendored): 45 deletions

@@ -1,45 +0,0 @@
-# GitHub Action that uses Black to reformat all Python code and submits a PR
-# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
-
-name: autoblack
-on:
-  workflow_dispatch:  # allow manual trigger
-  schedule:
-    - cron: '0 8 * * 5'  # every Friday at 8am UTC
-
-jobs:
-  autoblack:
-    if: github.repository_owner == 'explosion'
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-        with:
-            ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v4
-      - run: pip install black -c requirements.txt
-      - name: Auto-format code if needed
-        run: black spacy
-      # We can't run black --check here because that returns a non-zero exit
-      # code and makes GitHub think the action failed
-      - name: Check for modified files
-        id: git-check
-        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
-
-      - name: Create Pull Request
-        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v4
-        with:
-            title: Auto-format code with black
-            labels: meta
-            commit-message: Auto-format code with black
-            committer: GitHub <noreply@github.com>
-            author: explosion-bot <explosion-bot@users.noreply.github.com>
-            body: _This PR is auto-generated._
-            branch: autoblack
-            delete-branch: true
-            draft: false
-      - name: Check outputs
-        if: steps.git-check.outputs.modified == 'true'
-        run: |
-          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
-          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
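With this workflow gone, formatting is no longer auto-fixed on a schedule; instead the new tests.yml workflow below fails CI when `black --check` or `isort --check` reports drift. The modified-files detection the removed workflow relied on is a small shell idiom worth noting:

    # git diff-index --quiet HEAD -- exits 0 when the working tree is clean,
    # so this prints "true" exactly when the Black run changed something:
    if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi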
							
								
								
									
.github/workflows/explosionbot.yml (vendored): 1 addition

@@ -8,6 +8,7 @@ on:
 
 jobs:
   explosion-bot:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - name: Dump GitHub context
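The single added line, `if: github.repository_owner == 'explosion'`, is the same guard added to the other automation workflows below and already present in the deleted autoblack.yml above. The expression is evaluated per job, so on forks, where the repository owner differs, the job is skipped rather than run on the fork's schedule and with its permissions.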
							
								
								
									
.github/workflows/issue-manager.yml (vendored): 1 addition

@@ -13,6 +13,7 @@ on:
 
 jobs:
   issue-manager:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: tiangolo/issue-manager@0.4.0
							
								
								
									
.github/workflows/lock.yml (vendored): 1 addition

@@ -13,6 +13,7 @@ concurrency:
 
 jobs:
   action:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: dessant/lock-threads@v4
							
								
								
									
.github/workflows/spacy_universe_alert.yml (vendored): 1 addition

@@ -7,6 +7,7 @@ on:
 
 jobs:
   build:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
 
     steps:
							
								
								
									
.github/workflows/tests.yml (vendored, new file): 179 additions

@@ -0,0 +1,179 @@
+name: tests
+
+on:
+  push:
+    branches-ignore:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+    paths-ignore:
+      - "*.md"
+      - "*.mdx"
+      - "website/**"
+      - ".github/workflows/**"
+  pull_request:
+    types: [opened, synchronize, reopened, edited]
+    paths-ignore:
+      - "*.md"
+      - "*.mdx"
+      - "website/**"
+
+jobs:
+  validate:
+    name: Validate
+    if: github.repository_owner == 'explosion'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          architecture: x64
+
+      - name: black
+        run: |
+          python -m pip install black -c requirements.txt
+          python -m black spacy --check
+      - name: isort
+        run: |
+          python -m pip install isort -c requirements.txt
+          python -m isort spacy --check
+      - name: flake8
+        run: |
+          python -m pip install flake8==5.0.4
+          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+      - name: cython-lint
+        run: |
+          python -m pip install cython-lint -c requirements.txt
+          # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
+          cython-lint spacy --ignore E501,W291,E266
+
+  tests:
+    name: Test
+    needs: Validate
+    strategy:
+      fail-fast: true
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python_version: ["3.11"]
+        include:
+          - os: macos-latest
+            python_version: "3.8"
+          - os: ubuntu-20.04
+            python_version: "3.9"
+          - os: windows-latest
+            python_version: "3.10"
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python_version }}
+          architecture: x64
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -U build pip setuptools
+          python -m pip install -U -r requirements.txt
+
+      - name: Build sdist
+        run: |
+          python -m build --sdist
+
+      - name: Run mypy
+        run: |
+          python -m mypy spacy
+
+      - name: Delete source directory and .egg-info
+        run: |
+          rm -rf spacy *.egg-info
+        shell: bash
+
+      - name: Uninstall all packages
+        run: |
+          python -m pip freeze
+          python -m pip freeze --exclude pywin32 > installed.txt
+          python -m pip uninstall -y -r installed.txt
+
+      - name: Install from sdist
+        run: |
+          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+          SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+        shell: bash
+
+      - name: Test import
+        run: python -W error -c "import spacy"
+
+      #      - name: "Test download CLI"
+      #        run: |
+      #          python -m spacy download ca_core_news_sm
+      #          python -m spacy download ca_core_news_md
+      #          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test download_url in info CLI"
+      #        run: |
+      #          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test no warnings on load (#11713)"
+      #        run: |
+      #          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
+
+      - name: "Test convert CLI"
+        run: |
+          python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+        if: matrix.python_version == '3.9'
+
+      - name: "Test debug config CLI"
+        run: |
+          python -m spacy init config -p ner -l ca ner.cfg
+          python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+        if: matrix.python_version == '3.9'
+
+      - name: "Test debug data CLI"
+        run: |
+          # will have errors due to sparse data, check for summary in output
+          python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+        if: matrix.python_version == '3.9'
+
+      - name: "Test train CLI"
+        run: |
+          python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+        if: matrix.python_version == '3.9'
+
+      #      - name: "Test assemble CLI"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      #          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test assemble CLI vectors warning"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      #          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+      #        if: matrix.python_version == '3.9'
+
+      - name: "Install test requirements"
+        run: |
+          python -m pip install -U -r requirements.txt
+
+      - name: "Run CPU tests"
+        run: |
+          python -m pytest --pyargs spacy -W error
+        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
+
+      - name: "Run CPU tests with thinc-apple-ops"
+        run: |
+          python -m pip install 'spacy[apple]'
+          python -m pytest --pyargs spacy
+        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
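The validate job's checks can be reproduced locally with the same commands before pushing (a sketch assuming a repository checkout with requirements.txt available):

    # Same lint/format gate as the Validate job above:
    python -m pip install -c requirements.txt black isort cython-lint
    python -m pip install flake8==5.0.4
    python -m black spacy --check
    python -m isort spacy --check
    python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
    cython-lint spacy --ignore E501,W291,E266

Note that the final two pytest steps have complementary `if:` conditions, so exactly one of them runs per matrix cell; the macOS/3.11 cell is the only one that tests with thinc-apple-ops.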
							
								
								
									
.github/workflows/universe_validation.yml (vendored, new file): 33 additions

@@ -0,0 +1,33 @@
+name: universe validation
+
+on:
+  push:
+    branches-ignore:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+    paths:
+      - "website/meta/universe.json"
+  pull_request:
+    types: [opened, synchronize, reopened, edited]
+    paths:
+      - "website/meta/universe.json"
+
+jobs:
+  validate:
+    name: Validate
+    if: github.repository_owner == 'explosion'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          architecture: x64
+
+      - name: Validate website/meta/universe.json
+        run: |
+          python .github/validate_universe_json.py website/meta/universe.json
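This moves the universe.json check out of the main test matrix (the deleted Azure steps ran it only on Python 3.8) into its own workflow whose `paths` filter triggers it solely when website/meta/universe.json changes, so unrelated pushes skip it entirely while the check itself stays the same one-script invocation.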
							
								
								
									
Makefile: 2 changes

@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
+override SPACY_EXTRAS = spacy-lookups-data==1.0.3
 endif
 
 ifndef PYVER
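Because the assignment sits inside `ifndef SPACY_EXTRAS`, the trimmed default only applies when nothing else defines the variable; a definition from the environment or the make command line still wins. A hypothetical invocation restoring one of the dropped tokenizer extras:

    # Hypothetical: add jieba back on top of the new default for a local build
    SPACY_EXTRAS="spacy-lookups-data==1.0.3 jieba" make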
							
								
								
									
README.md: 32 changes

@@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 
+💥 **We'd love to hear more about your experience with spaCy!**
+[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
+
 💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
@@ -32,19 +35,20 @@ open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 
 ## 📖 Documentation
 
-| Documentation              |                                                                |
-| -------------------------- | -------------------------------------------------------------- |
-| ⭐️ **[spaCy 101]**        | New to spaCy? Here's everything you need to know!              |
-| 📚 **[Usage Guides]**      | How to use spaCy and its features.                             |
-| 🚀 **[New in v3.0]**       | New features, backwards incompatibilities and migration guide. |
-| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run.            |
-| 🎛 **[API Reference]**      | The detailed reference for spaCy's API.                        |
-| 📦 **[Models]**            | Download trained pipelines for spaCy.                          |
-| 🌌 **[Universe]**          | Plugins, extensions, demos and books from the spaCy ecosystem. |
-| 👩🏫 **[Online Course]**    | Learn spaCy in this free and interactive online course.        |
-| 📺 **[Videos]**            | Our YouTube channel with video tutorials, talks and more.      |
-| 🛠 **[Changelog]**          | Changes and version history.                                   |
-| 💝 **[Contribute]**        | How to contribute to the spaCy project and code base.          |
+| Documentation                    |                                                                         |
+| -------------------------------- | ----------------------------------------------------------------------- |
+| ⭐️ **[spaCy 101]**              | New to spaCy? Here's everything you need to know!                       |
+| 📚 **[Usage Guides]**            | How to use spaCy and its features.                                      |
+| 🚀 **[New in v3.0]**             | New features, backwards incompatibilities and migration guide.          |
+| 🪐 **[Project Templates]**       | End-to-end workflows you can clone, modify and run.                     |
+| 🎛 **[API Reference]**            | The detailed reference for spaCy's API.                                 |
+| 📦 **[Models]**                  | Download trained pipelines for spaCy.                                   |
+| 🌌 **[Universe]**                | Plugins, extensions, demos and books from the spaCy ecosystem.          |
+| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files.  |
+| 👩🏫 **[Online Course]**          | Learn spaCy in this free and interactive online course.                 |
+| 📺 **[Videos]**                  | Our YouTube channel with video tutorials, talks and more.               |
+| 🛠 **[Changelog]**                | Changes and version history.                                            |
+| 💝 **[Contribute]**              | How to contribute to the spaCy project and code base.                   |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
 | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
 
@@ -54,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
+[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
 
-
 ## 💬 Where to ask questions
 
 The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
azure-pipelines.yml: 99 deletions

@@ -1,99 +0,0 @@
-trigger:
-  batch: true
-  branches:
-    include:
-      - "*"
-    exclude:
-      - "spacy.io"
-      - "nightly.spacy.io"
-      - "v2.spacy.io"
-  paths:
-    exclude:
-      - "website/*"
-      - "*.md"
-      - "*.mdx"
-      - ".github/workflows/*"
-pr:
-  paths:
-    exclude:
-      - "*.md"
-      - "*.mdx"
-      - "website/docs/*"
-      - "website/src/*"
-      - "website/meta/*.tsx"
-      - "website/meta/*.mjs"
-      - "website/meta/languages.json"
-      - "website/meta/site.json"
-      - "website/meta/sidebars.json"
-      - "website/meta/type-annotations.json"
-      - "website/pages/*"
-      - ".github/workflows/*"
-
-jobs:
-  # Check formatting and linting. Perform basic checks for most important errors
-  # (syntax etc.) Uses the config defined in setup.cfg and overwrites the
-  # selected codes.
-  - job: "Validate"
-    pool:
-      vmImage: "ubuntu-latest"
-    steps:
-      - task: UsePythonVersion@0
-        inputs:
-          versionSpec: "3.8"
-      - script: |
-          pip install black -c requirements.txt
-          python -m black spacy --check
-        displayName: "black"
-      - script: |
-          pip install flake8==5.0.4
-          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
-        displayName: "flake8"
-
-  - job: "Test"
-    dependsOn: "Validate"
-    strategy:
-      matrix:
-        # We're only running one platform per Python version to speed up builds
-        #        Python38Linux:
-        #          imageName: "ubuntu-latest"
-        #          python.version: "3.8"
-        #        Python38Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.8"
-        Python38Mac:
-          imageName: "macos-latest"
-          python.version: "3.8"
-        Python39Linux:
-          imageName: "ubuntu-latest"
-          python.version: "3.9"
-        #        Python39Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.9"
-        #        Python39Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.9"
-        #        Python310Linux:
-        #          imageName: "ubuntu-latest"
-        #          python.version: "3.10"
-        Python310Windows:
-          imageName: "windows-latest"
-          python.version: "3.10"
-        #        Python310Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.10"
-        Python311Linux:
-          imageName: 'ubuntu-latest'
-          python.version: '3.11'
-        Python311Windows:
-          imageName: 'windows-latest'
-          python.version: '3.11'
-        Python311Mac:
-          imageName: 'macos-latest'
-          python.version: '3.11'
-      maxParallel: 4
-    pool:
-      vmImage: $(imageName)
-    steps:
-      - template: .github/azure-steps.yml
-        parameters:
-          python_version: '$(python.version)'
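Together with the deletion of .github/azure-steps.yml above, this retires Azure Pipelines completely. The trigger/pr path filters and the "one platform per Python version" policy visible in the commented-out matrix entries carry over to the push/pull_request filters and the include matrix of the new .github/workflows/tests.yml.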
build-constraints.txt: 2 deletions

@@ -1,6 +1,4 @@
 # build version constraints for use with wheelwright + multibuild
-numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
 numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
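Dropping the Python 3.6/3.7 numpy pins is consistent with the new `python_requires = >=3.8` floor in setup.cfg below: there is no need for build constraints on interpreters the package no longer supports.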
pyproject.toml: 3 additions

@@ -9,3 +9,6 @@ requires = [
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"
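Setting `profile = "black"` makes isort's line wrapping and trailing-comma behavior match Black, so the separate black and isort checks in the new validate job cannot disagree. isort picks the setting up automatically from pyproject.toml; a one-off equivalent without it would be:

    # Equivalent ad-hoc invocation of the check the CI runs:
    python -m isort --profile black spacy --check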
requirements.txt

@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.8.0
+typer>=0.3.0,<0.10.0
 pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
 # Third party dependencies
@@ -30,10 +30,11 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
-types-dataclasses>=0.1.3; python_version < "3.7"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
+cython-lint>=0.15.0; python_version >= "3.7"
+isort>=5.0,<6.0
							
								
								
									
setup.cfg: 46 changes

@@ -30,6 +30,14 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
+setup_requires =
+    cython>=0.25,<3.0
+    numpy>=1.15.0
+    # We also need our Cython packages here to compile against
+    cymem>=2.0.2,<2.1.0
+    preshed>=3.0.2,<3.1.0
+    murmurhash>=0.28.0,<1.1.0
+    thinc>=9.0.0.dev2,<9.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -42,7 +50,7 @@ install_requires =
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
-    typer>=0.3.0,<0.8.0
+    typer>=0.3.0,<0.10.0
     pathy>=0.10.0
     smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
@@ -67,41 +75,41 @@ transformers =
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<12.0.0
+    cupy>=5.0.0b4,<13.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<12.0.0
+    cupy-cuda80>=5.0.0b4,<13.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<12.0.0
+    cupy-cuda90>=5.0.0b4,<13.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<12.0.0
+    cupy-cuda91>=5.0.0b4,<13.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<12.0.0
+    cupy-cuda92>=5.0.0b4,<13.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<12.0.0
+    cupy-cuda100>=5.0.0b4,<13.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<12.0.0
+    cupy-cuda101>=5.0.0b4,<13.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<12.0.0
+    cupy-cuda102>=5.0.0b4,<13.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<12.0.0
+    cupy-cuda110>=5.0.0b4,<13.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<12.0.0
+    cupy-cuda111>=5.0.0b4,<13.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<12.0.0
+    cupy-cuda112>=5.0.0b4,<13.0.0
 cuda113 =
-    cupy-cuda113>=5.0.0b4,<12.0.0
+    cupy-cuda113>=5.0.0b4,<13.0.0
 cuda114 =
-    cupy-cuda114>=5.0.0b4,<12.0.0
+    cupy-cuda114>=5.0.0b4,<13.0.0
 cuda115 =
-    cupy-cuda115>=5.0.0b4,<12.0.0
+    cupy-cuda115>=5.0.0b4,<13.0.0
 cuda116 =
-    cupy-cuda116>=5.0.0b4,<12.0.0
+    cupy-cuda116>=5.0.0b4,<13.0.0
 cuda117 =
-    cupy-cuda117>=5.0.0b4,<12.0.0
+    cupy-cuda117>=5.0.0b4,<13.0.0
 cuda11x =
-    cupy-cuda11x>=11.0.0,<12.0.0
+    cupy-cuda11x>=11.0.0,<13.0.0
 cuda-autodetect =
-    cupy-wheel>=11.0.0,<12.0.0
+    cupy-wheel>=11.0.0,<13.0.0
 apple =
     thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies
| 
						 | 
@@ -1,6 +1,6 @@
-from typing import Union, Iterable, Dict, Any
-from pathlib import Path
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, Union
 
 # set library-specific custom warning handling before doing anything else
 from .errors import setup_default_warnings
@@ -8,20 +8,17 @@ from .errors import setup_default_warnings
 setup_default_warnings()  # noqa: E402
 
 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
-from thinc.api import Config
+from thinc.api import Config, prefer_gpu, require_cpu, require_gpu  # noqa: F401
 
 from . import pipeline  # noqa: F401
-from .cli.info import info  # noqa: F401
-from .glossary import explain  # noqa: F401
-from .about import __version__  # noqa: F401
-from .util import registry, logger  # noqa: F401
-
-from .errors import Errors
-from .language import Language
-from .vocab import Vocab
 from . import util
+from .about import __version__  # noqa: F401
+from .cli.info import info  # noqa: F401
+from .errors import Errors
+from .glossary import explain  # noqa: F401
+from .language import Language
+from .util import logger, registry  # noqa: F401
+from .vocab import Vocab
 
 if sys.maxunicode == 65535:
     raise SystemError(Errors.E130)

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev0"
+__version__ = "4.0.0.dev1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

@@ -1,5 +1,6 @@
 from . cimport symbols
 
+
 cdef enum attr_id_t:
     NULL_ATTR = 0
     IS_ALPHA = symbols.IS_ALPHA

@@ -1,35 +1,35 @@
 from wasabi import msg
 
 from ._util import app, setup_cli  # noqa: F401
+from .apply import apply  # noqa: F401
+from .assemble import assemble_cli  # noqa: F401
 
 # These are the actual functions, NOT the wrapped CLI commands. The CLI commands
 # are registered automatically and won't have to be imported here.
 from .benchmark_speed import benchmark_speed_cli  # noqa: F401
-from .download import download  # noqa: F401
-from .info import info  # noqa: F401
-from .package import package  # noqa: F401
-from .profile import profile  # noqa: F401
-from .train import train_cli  # noqa: F401
-from .assemble import assemble_cli  # noqa: F401
-from .pretrain import pretrain  # noqa: F401
-from .debug_data import debug_data  # noqa: F401
-from .debug_config import debug_config  # noqa: F401
-from .debug_model import debug_model  # noqa: F401
-from .debug_diff import debug_diff  # noqa: F401
-from .evaluate import evaluate  # noqa: F401
-from .apply import apply  # noqa: F401
 from .convert import convert  # noqa: F401
-from .init_pipeline import init_pipeline_cli  # noqa: F401
-from .init_config import init_config, fill_config  # noqa: F401
-from .validate import validate  # noqa: F401
-from .project.clone import project_clone  # noqa: F401
-from .project.assets import project_assets  # noqa: F401
-from .project.run import project_run  # noqa: F401
-from .project.dvc import project_update_dvc  # noqa: F401
-from .project.push import project_push  # noqa: F401
-from .project.pull import project_pull  # noqa: F401
-from .project.document import project_document  # noqa: F401
+from .debug_config import debug_config  # noqa: F401
+from .debug_data import debug_data  # noqa: F401
+from .debug_diff import debug_diff  # noqa: F401
+from .debug_model import debug_model  # noqa: F401
+from .download import download  # noqa: F401
+from .evaluate import evaluate  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
+from .info import info  # noqa: F401
+from .init_config import fill_config, init_config  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
+from .package import package  # noqa: F401
+from .pretrain import pretrain  # noqa: F401
+from .profile import profile  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.document import project_document  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
+from .project.pull import project_pull  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .train import train_cli  # noqa: F401
+from .validate import validate  # noqa: F401
 
 
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

@@ -1,26 +1,45 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
+import os
+import shutil
+import sys
+from configparser import InterpolationError
+from contextlib import contextmanager
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
+import srsly
 import typer
 from click import NoSuchOption
 from click.parser import split_arg_string
-from typer.main import get_command
-from contextlib import contextmanager
 from thinc.api import Config, ConfigValidationError, require_gpu
 from thinc.util import gpu_is_available
-from configparser import InterpolationError
-import os
+from typer.main import get_command
+from wasabi import Printer, msg
 
-from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
+from ..errors import RENAMED_LANGUAGE_CODES
+from ..schemas import ProjectConfigSchema, validate
+from ..util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    import_file,
+    is_compatible_version,
+    logger,
+    make_tempdir,
+    registry,
+    run_command,
+)
 
 if TYPE_CHECKING:
     from pathy import FluidPath  # noqa: F401

@@ -1,18 +1,15 @@
-import tqdm
-import srsly
-
 from itertools import chain
 from pathlib import Path
-from typing import Optional, List, Iterable, cast, Union
+from typing import Iterable, List, Optional, Union, cast
+
+import srsly
+import tqdm
 from wasabi import msg
 
-from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
-
 from ..tokens import Doc, DocBin
-from ..vocab import Vocab
 from ..util import ensure_path, load_model
+from ..vocab import Vocab
+from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
 
 path_help = """Location of the documents to predict on.
 Can be a single file in .spacy format or a .jsonl file.

@@ -1,13 +1,20 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
 
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
 from .. import util
 from ..util import get_sourced_components, load_model_from_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)
 
 
 @app.command(

@@ -1,11 +1,12 @@
-from typing import Iterable, List, Optional
 import random
-from itertools import islice
-import numpy
-from pathlib import Path
 import time
-from tqdm import tqdm
+from itertools import islice
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+import numpy
 import typer
+from tqdm import tqdm
 from wasabi import msg
 
 from .. import util

@@ -1,18 +1,22 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, Union
-from enum import Enum
-from pathlib import Path
-from wasabi import Printer
-import srsly
+import itertools
 import re
 import sys
-import itertools
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Iterable, Mapping, Optional, Union
+
+import srsly
+from wasabi import Printer
 
-from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
-from ..training.converters import conllu_to_docs
+from ..training import docs_to_json
+from ..training.converters import (
+    conll_ner_to_docs,
+    conllu_to_docs,
+    iob_to_docs,
+    json_to_docs,
+)
+from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory
 
 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new

@@ -1,15 +1,22 @@
-from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
-from wasabi import msg, table
+from typing import Any, Dict, List, Optional, Union
+
+import typer
 from thinc.api import Config
 from thinc.config import VARIABLE_RE
-import typer
+from wasabi import msg, table
 
-from ._util import Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
+from .. import util
 from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)
 
 
 @debug_cli.command(

@@ -1,29 +1,49 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import Literal, cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+    cast,
+    overload,
+)
+
+import numpy
+import srsly
+import typer
+from wasabi import MESSAGES, Printer, msg
 
-from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, _format_number
-from ..training import Example, remove_bilu_prefix
-from ..training.initialize import get_sourced_components
-from ..schemas import ConfigSchemaTraining
-from ..pipeline import TrainablePipe
+from .. import util
+from ..language import Language
+from ..morphology import Morphology
+from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
+from ..pipeline._edit_tree_internals.edit_trees import EditTrees
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer, SpanCategorizer
-from ..pipeline._edit_tree_internals.edit_trees import EditTrees
-from ..morphology import Morphology
-from ..language import Language
+from ..schemas import ConfigSchemaTraining
+from ..training import Example, remove_bilu_prefix
+from ..training.initialize import get_sourced_components
 from ..util import registry, resolve_dot_names
 from ..vectors import Mode as VectorsMode
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    _format_number,
+    app,
+    debug_cli,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)
 
 # Minimum number of expected occurrences of NER label in data to train new label
 NEW_LABEL_THRESHOLD = 50
@@ -210,7 +230,7 @@ def debug_data(
     else:
         msg.info("No word vectors present in the package")
 
-    if "spancat" in factory_names:
+    if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
         model_labels_spancat = _get_labels_from_spancat(nlp)
         has_low_data_warning = False
         has_no_neg_warning = False
@@ -335,7 +355,7 @@ def debug_data(
                 show=verbose,
             )
         else:
-            msg.good("Examples without ocurrences available for all labels")
+            msg.good("Examples without occurrences available for all labels")
 
     if "ner" in factory_names:
         # Get all unique NER labels present in the data
@@ -520,9 +540,13 @@ def debug_data(
 
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
-        label_list = [label for label in gold_train_data["tags"]]
-        model_labels = _get_labels_from_model(nlp, "tagger")
+        label_list, counts = zip(*gold_train_data["tags"].items())
         msg.info(f"{len(label_list)} label(s) in train data")
+        p = numpy.array(counts)
+        p = p / p.sum()
+        norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
+        msg.info(f"{norm_entropy} is the normalised label entropy")
+        model_labels = _get_labels_from_model(nlp, "tagger")
        labels = set(label_list)
         missing_labels = model_labels - labels
         if missing_labels:
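The block added above reports a normalised label entropy for the training tag distribution. A self-contained sketch of the same computation, with invented counts:

    # Sketch of the normalised label entropy, using made-up tag frequencies.
    # 1.0 means tags are uniformly distributed; near 0 means one tag dominates.
    import numpy

    counts = numpy.array([900, 80, 20])  # hypothetical frequencies of 3 tags
    p = counts / counts.sum()
    norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(counts))
    print(f"{norm_entropy:.2f}")  # ~0.34 for this skewed distribution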
@@ -824,7 +848,7 @@ def _compile_gold(
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-        if "spancat" in factory_names:
+        if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
             for spans_key in list(eg.reference.spans.keys()):
                 # Obtain the span frequency
                 if spans_key not in data["spancat"]:
@@ -1022,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
     pipe_names = [
         pipe_name
         for pipe_name in nlp.pipe_names
-        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+        if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
     ]
     labels: Dict[str, Set[str]] = {}
     for pipe_name in pipe_names:

@@ -1,13 +1,13 @@
+from pathlib import Path
 from typing import Optional
+
 import typer
-from wasabi import Printer, diff_strings, MarkdownRenderer
-from pathlib import Path
 from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, diff_strings
 
-from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
 from ..util import load_config
-from .init_config import init_config, Optimizations
+from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
+from .init_config import Optimizations, init_config
 
 
 @debug_cli.command(

@@ -1,19 +1,32 @@
-from typing import Dict, Any, Optional
-from pathlib import Path
 import itertools
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import typer
+from thinc.api import (
+    Model,
+    data_validation,
+    fix_random_seed,
+    set_dropout_rate,
+    set_gpu_allocator,
+)
+from wasabi import msg
 
 from spacy.training import Example
 from spacy.util import resolve_dot_names
-from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate
-from thinc.api import Model, data_validation, set_gpu_allocator
-import typer
 
-from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list, setup_gpu
+from .. import util
 from ..schemas import ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+    string_to_list,
+)
 
 
 @debug_cli.command(

@@ -1,14 +1,20 @@
-from typing import Optional, Sequence
-import requests
 import sys
-from wasabi import msg
-import typer
+from typing import Optional, Sequence
+
+import requests
+import typer
+from wasabi import msg
 
-from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version, get_installed_models
-from ..util import get_package_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_package_version,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
 @app.command(
@@ -83,11 +89,8 @@ def download(
 
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
     dl_tpl = "{m}-{v}/{m}-{v}{s}"
-    egg_tpl = "#egg={m}=={v}"
     suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
     filename = dl_tpl.format(m=model_name, v=version, s=suffix)
-    if sdist:
-        filename += egg_tpl.format(m=model_name, v=version)
     return filename

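With the #egg= fragment dropped, the helper now returns a bare release path for wheels and sdists alike. A standalone sketch, with an invented model name and version, and suffix values assumed to mirror the SDIST_SUFFIX/WHEEL_SUFFIX constants in _util:

    # Sketch only; suffix values are assumptions, not copied from the source.
    SDIST_SUFFIX = ".tar.gz"
    WHEEL_SUFFIX = "-py3-none-any.whl"

    def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
        dl_tpl = "{m}-{v}/{m}-{v}{s}"
        suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
        return dl_tpl.format(m=model_name, v=version, s=suffix)

    print(get_model_filename("en_core_web_sm", "4.0.0"))
    # en_core_web_sm-4.0.0/en_core_web_sm-4.0.0-py3-none-any.whl
    print(get_model_filename("en_core_web_sm", "4.0.0", sdist=True))
    # en_core_web_sm-4.0.0/en_core_web_sm-4.0.0.tar.gz  (no #egg= fragment)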
@@ -1,16 +1,16 @@
-from typing import Optional, List, Dict, Any, Union
-from wasabi import Printer
-from pathlib import Path
 import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
 import srsly
 from thinc.api import fix_random_seed
+from wasabi import Printer
 
-from ..training import Corpus
-from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
+from .. import displacy, util
 from ..scorer import Scorer
-from .. import util
-from .. import displacy
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
 
 
 @benchmark_cli.command(
@@ -27,6 +27,7 @@ def evaluate_cli(
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+    per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
     # fmt: on
 ):
     """
@@ -50,6 +51,7 @@ def evaluate_cli(
         gold_preproc=gold_preproc,
         displacy_path=displacy_path,
         displacy_limit=displacy_limit,
+        per_component=per_component,
         silent=False,
     )
 
@@ -64,6 +66,7 @@ def evaluate(
     displacy_limit: int = 25,
     silent: bool = True,
     spans_key: str = "sc",
+    per_component: bool = False,
 ) -> Dict[str, Any]:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
@@ -78,50 +81,61 @@ def evaluate(
     corpus = Corpus(data_path, gold_preproc=gold_preproc)
     nlp = util.load_model(model)
     dev_dataset = list(corpus(nlp))
-    scores = nlp.evaluate(dev_dataset)
-    metrics = {
-        "TOK": "token_acc",
-        "TAG": "tag_acc",
-        "POS": "pos_acc",
-        "MORPH": "morph_acc",
-        "LEMMA": "lemma_acc",
-        "UAS": "dep_uas",
-        "LAS": "dep_las",
-        "NER P": "ents_p",
-        "NER R": "ents_r",
-        "NER F": "ents_f",
-        "TEXTCAT": "cats_score",
-        "SENT P": "sents_p",
-        "SENT R": "sents_r",
-        "SENT F": "sents_f",
-        "SPAN P": f"spans_{spans_key}_p",
-        "SPAN R": f"spans_{spans_key}_r",
-        "SPAN F": f"spans_{spans_key}_f",
-        "SPEED": "speed",
-    }
-    results = {}
-    data = {}
-    for metric, key in metrics.items():
-        if key in scores:
-            if key == "cats_score":
-                metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
-            if isinstance(scores[key], (int, float)):
-                if key == "speed":
-                    results[metric] = f"{scores[key]:.0f}"
-                else:
-                    results[metric] = f"{scores[key]*100:.2f}"
-            else:
-                results[metric] = "-"
-            data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
+    scores = nlp.evaluate(dev_dataset, per_component=per_component)
+    if per_component:
+        data = scores
+        if output is None:
+            msg.warn(
+                "The per-component option is enabled but there is no output JSON file provided to save the scores to."
+            )
+        else:
+            msg.info("Per-component scores will be saved to output JSON file.")
+    else:
+        metrics = {
+            "TOK": "token_acc",
+            "TAG": "tag_acc",
+            "POS": "pos_acc",
+            "MORPH": "morph_acc",
+            "LEMMA": "lemma_acc",
+            "UAS": "dep_uas",
+            "LAS": "dep_las",
+            "NER P": "ents_p",
+            "NER R": "ents_r",
+            "NER F": "ents_f",
+            "TEXTCAT": "cats_score",
+            "SENT P": "sents_p",
+            "SENT R": "sents_r",
+            "SENT F": "sents_f",
+            "SPAN P": f"spans_{spans_key}_p",
+            "SPAN R": f"spans_{spans_key}_r",
+            "SPAN F": f"spans_{spans_key}_f",
+            "SPEED": "speed",
+        }
+        results = {}
+        data = {}
+        for metric, key in metrics.items():
+            if key in scores:
+                if key == "cats_score":
+                    metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
+                if isinstance(scores[key], (int, float)):
+                    if key == "speed":
+                        results[metric] = f"{scores[key]:.0f}"
+                    else:
+                        results[metric] = f"{scores[key]*100:.2f}"
+                else:
+                    results[metric] = "-"
+                data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
 
-    msg.table(results, title="Results")
-    data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
+        msg.table(results, title="Results")
+        data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
 
     if displacy_path:
         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
         docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
         render_deps = "parser" in factory_names
         render_ents = "ner" in factory_names
+        render_spans = "spancat" in factory_names
 
         render_parses(
             docs,
             displacy_path,
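The new per_component branch passes the flag through to nlp.evaluate(), which then returns scores nested per pipe instead of the flat metrics table. A minimal runnable sketch, assuming a spaCy build that includes this change (pipeline and text invented):

    # Sketch only: contrast flat vs per-component evaluation output.
    import spacy
    from spacy.training import Example

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer")
    ref = nlp("This is a test. Another sentence here.")
    examples = [Example(nlp.make_doc(ref.text), ref)]

    flat = nlp.evaluate(examples)                        # flat metric dict
    nested = nlp.evaluate(examples, per_component=True)  # keyed by pipe name
    print(sorted(flat))    # e.g. ['sents_f', 'sents_p', 'sents_r', ...]
    print(sorted(nested))  # e.g. ['sentencizer']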
@@ -129,6 +143,7 @@ def evaluate(
             limit=displacy_limit,
             deps=render_deps,
             ents=render_ents,
+            spans=render_spans,
         )
         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
 
@@ -182,6 +197,7 @@ def render_parses(
     limit: int = 250,
     deps: bool = True,
     ents: bool = True,
+    spans: bool = True,
 ):
     docs[0].user_data["title"] = model_name
     if ents:
@@ -195,6 +211,11 @@ def render_parses(
         with (output_path / "parses.html").open("w", encoding="utf8") as file_:
             file_.write(html)
 
+    if spans:
+        html = displacy.render(docs[:limit], style="span", page=True)
+        with (output_path / "spans.html").open("w", encoding="utf8") as file_:
+            file_.write(html)
+
 
 def print_prf_per_type(
     msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str

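The added branch renders a span-style displacy page next to the existing dependency and entity pages. A self-contained sketch of that rendering path, with an invented doc and label (style="span" reads doc.spans, key "sc" by default):

    # Sketch only: render one doc's spans to a standalone HTML page.
    import spacy
    from spacy import displacy
    from spacy.tokens import Span

    nlp = spacy.blank("en")
    doc = nlp("Welcome to the Bank of China.")
    doc.spans["sc"] = [Span(doc, 3, 6, "ORG")]  # tokens "Bank of China"
    html = displacy.render([doc], style="span", page=True)
    with open("spans.html", "w", encoding="utf8") as file_:
        file_.write(html)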
@@ -1,17 +1,17 @@
 import functools
+import logging
 import operator
 from pathlib import Path
-import logging
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
 
 import numpy
 import wasabi.tables
 
-from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
-from ..errors import Errors
-from ..training import Corpus
-from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
+from ..errors import Errors
+from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
+from ..training import Corpus
+from ._util import Arg, Opt, app, import_code, setup_gpu
 
 _DEFAULTS = {
     "n_trials": 11,
@@ -35,7 +35,7 @@ def find_threshold_cli(
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
-    verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
     """

@@ -1,15 +1,15 @@
-from typing import Optional, Dict, Any, Union, List
-import platform
-import json
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer
-import srsly
 import importlib.metadata
+import json
+import platform
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+import srsly
+from wasabi import MarkdownRenderer, Printer
 
-from ._util import app, Arg, Opt, string_to_list
-from .download import get_model_filename, get_latest_version
-from .. import util
-from .. import about
+from .. import about, util
+from ._util import Arg, Opt, app, string_to_list
+from .download import get_latest_version, get_model_filename
 
 
 @app.command("info")

@@ -1,19 +1,27 @@
-from typing import Optional, List, Tuple
+import re
 from enum import Enum
 from pathlib import Path
-from wasabi import Printer, diff_strings
-from thinc.api import Config
+from typing import List, Optional, Tuple
+
 import srsly
-import re
 from jinja2 import Template
+from thinc.api import Config
+from wasabi import Printer, diff_strings
 
 from .. import util
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code, _handle_renamed_language_codes
+from ._util import (
+    COMMAND,
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    show_validation_error,
+    string_to_list,
+)
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"

@@ -1,15 +1,24 @@
-from typing import Optional
 import logging
 from pathlib import Path
-from wasabi import msg
-import typer
+from typing import Optional
+
 import srsly
+import typer
+from wasabi import msg
 
 from .. import util
-from ..training.initialize import init_nlp, convert_vectors
 from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu, _handle_renamed_language_codes
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @init_cli.command("vectors")
					@ -23,6 +32,7 @@ def init_vectors_cli(
 | 
				
			||||||
    mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
 | 
					    mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
 | 
				
			||||||
    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | 
					    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
 | 
				
			||||||
    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
 | 
					    jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
 | 
				
			||||||
 | 
					    attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """Convert word vectors for use with spaCy. Will export an nlp object that
 | 
					    """Convert word vectors for use with spaCy. Will export an nlp object that
 | 
				
			||||||
| 
						 | 
					@ -44,6 +54,7 @@ def init_vectors_cli(
 | 
				
			||||||
        truncate=truncate,
 | 
					        truncate=truncate,
 | 
				
			||||||
        prune=prune,
 | 
					        prune=prune,
 | 
				
			||||||
        mode=mode,
 | 
					        mode=mode,
 | 
				
			||||||
 | 
					        attr=attr,
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
 | 
					    msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
 | 
				
			||||||
    nlp.to_disk(output_dir)
 | 
					    nlp.to_disk(output_dir)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
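
The new `--attr` option lets `spacy init vectors` key the vector table on a token attribute other than the default `ORTH`. A hedged sketch of the effect (the output path and vector data are placeholders, not from the diff):

    import numpy
    import spacy

    # Hypothetical: a pipeline exported by
    #   python -m spacy init vectors en vectors.txt ./output --attr LOWER
    nlp = spacy.load("./output")
    doc = nlp("Apple apple")
    # With attr="LOWER", both tokens should resolve to the same vector row.
    assert numpy.array_equal(doc[0].vector, doc[1].vector)
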
@@ -1,18 +1,18 @@
-from typing import Optional, Union, Any, Dict, List, Tuple, cast
-import shutil
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer, get_raw_input
-from thinc.api import Config
-from collections import defaultdict
-from catalogue import RegistryError
-import srsly
-import sys
 import re
-from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
-from ..schemas import validate, ModelMetaSchema
-from .. import util
-from .. import about
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
+
+import srsly
+from catalogue import RegistryError
+from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, get_raw_input
+
+from .. import about, util
+from ..schemas import ModelMetaSchema, validate
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
 
 
 @app.command("package")

@@ -1,13 +1,21 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import re
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
+
 from ..training.pretrain import pretrain
 from ..util import load_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @app.command(

@@ -23,6 +31,7 @@ def pretrain_cli(
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
     # fmt: on
 ):
     """

@@ -74,6 +83,7 @@ def pretrain_cli(
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
         silent=False,
+        skip_last=skip_last,
     )
     msg.good("Successfully finished pretrain")

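
The new `--skip-last` flag skips writing `model-last.bin` at the end of pretraining, which is useful when only the per-epoch checkpoints are wanted. A sketch of the Python equivalent, assuming the `pretrain()` signature wired up by the CLI above (the config path and output directory are placeholders):

    from pathlib import Path
    from spacy.training.pretrain import pretrain
    from spacy.util import load_config

    config = load_config("config.cfg", interpolate=False)  # placeholder path
    pretrain(
        config,
        Path("./pretrain_output"),  # receives model-epoch-*.bin checkpoints
        use_gpu=-1,
        silent=False,
        skip_last=True,  # new in this diff: do not write model-last.bin
    )
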
@@ -1,17 +1,18 @@
-from typing import Optional, Sequence, Union, Iterator
-import tqdm
-from pathlib import Path
-import srsly
 import cProfile
+import itertools
 import pstats
 import sys
-import itertools
-from wasabi import msg, Printer
-import typer
-from ._util import app, debug_cli, Arg, Opt, NAME
+from pathlib import Path
+from typing import Iterator, Optional, Sequence, Union
+
+import srsly
+import tqdm
+import typer
+from wasabi import Printer, msg
+
 from ..language import Language
 from ..util import load_model
+from ._util import NAME, Arg, Opt, app, debug_cli
 
 
 @debug_cli.command("profile")

@@ -1,16 +1,27 @@
-from typing import Any, Dict, Optional
-from pathlib import Path
-from wasabi import msg
 import os
 import re
 import shutil
+from pathlib import Path
+from typing import Any, Dict, Optional
+
 import requests
 import typer
+from wasabi import msg
+
 from ...util import ensure_path, working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-from .._util import get_checksum, download_file, git_checkout, get_git_version
-from .._util import SimpleFrozenDict, parse_config_overrides
+from .._util import (
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    SimpleFrozenDict,
+    download_file,
+    get_checksum,
+    get_git_version,
+    git_checkout,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
 
 # Whether assets are extra if `extra` is not set.
 EXTRA_DEFAULT = False

@@ -1,13 +1,22 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import subprocess
 import re
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from wasabi import msg
+
 from ... import about
 from ...util import ensure_path
-from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_checkout, get_git_version, git_repo_branch_exists
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_git_version,
+    git_checkout,
+    git_repo_branch_exists,
+    project_cli,
+)
 
 DEFAULT_REPO = about.__projects__
 DEFAULT_PROJECTS_BRANCH = about.__projects_branch__

@@ -1,9 +1,9 @@
 from pathlib import Path
-from wasabi import msg, MarkdownRenderer
+
+from wasabi import MarkdownRenderer, msg
+
 from ...util import working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-
+from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
 
 DOCS_URL = "https://spacy.io"
 INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the

@@ -1,15 +1,28 @@
 """This module contains helpers and subcommands for integrating spaCy projects
 with Data Version Controk (DVC). https://dvc.org"""
-from typing import Dict, Any, List, Optional, Iterable
 import subprocess
 from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
 from wasabi import msg
-from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
-from .._util import Arg, Opt, NAME, COMMAND
-from ...util import working_dir, split_command, join_command, run_command
-from ...util import SimpleFrozenList
+
+from ...util import (
+    SimpleFrozenList,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    NAME,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_hash,
+    load_project_config,
+    project_cli,
+)
 
 DVC_CONFIG = "dvc.yaml"
 DVC_DIR = ".dvc"

@@ -1,9 +1,9 @@
 from pathlib import Path
+
 from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_command_hash
-from .._util import project_cli, Arg, logger
-from .._util import load_project_config
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash
 from .run import update_lockfile
 
 

@@ -1,9 +1,9 @@
 from pathlib import Path
+
 from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_content_hash, get_command_hash
-from .._util import load_project_config
-from .._util import project_cli, Arg, logger
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
 
 
 @project_cli.command("push")

@@ -1,18 +1,25 @@
-from typing import Optional, List, Dict, TYPE_CHECKING
+import hashlib
 import os
 import site
-import hashlib
-import urllib.parse
 import tarfile
+import urllib.parse
 from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
+
 from wasabi import msg
-from .._util import get_hash, get_checksum, upload_file, download_file
-from .._util import ensure_pathy, make_tempdir
-from ...util import get_minor_version, ENV_VARS, check_bool_env_var
-from ...git_info import GIT_VERSION
+
 from ... import about
 from ...errors import Errors
+from ...git_info import GIT_VERSION
+from ...util import ENV_VARS, check_bool_env_var, get_minor_version
+from .._util import (
+    download_file,
+    ensure_pathy,
+    get_checksum,
+    get_hash,
+    make_tempdir,
+    upload_file,
+)
 
 if TYPE_CHECKING:
     from pathy import FluidPath  # noqa: F401

@@ -1,20 +1,39 @@
-from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
 import os.path
-from pathlib import Path
-
-from wasabi import msg
-from wasabi.util import locale_escape
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+
 import srsly
 import typer
+from wasabi import msg
+from wasabi.util import locale_escape
+
 from ... import about
 from ...git_info import GIT_VERSION
-from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
-from ...util import check_bool_env_var, SimpleFrozenDict
-from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
-from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
+from ...util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    check_bool_env_var,
+    is_cwd,
+    is_minor_version_match,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    PROJECT_LOCK,
+    Arg,
+    Opt,
+    get_checksum,
+    get_hash,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
 
 
 @project_cli.command(

@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
 {%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
 [paths]
 train = null
 dev = null

@@ -24,8 +24,11 @@ gpu_allocator = null
 lang = "{{ lang }}"
 {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
 {%- set with_accuracy = optimize == "accuracy" -%}
-{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
+{# The BOW textcat doesn't need a source of features, so it can omit the
+tok2vec/transformer. #}
+{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
+{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
 {%- else -%}
 {%- set full_pipeline = components -%}
@@ -122,6 +125,30 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 {% endif -%}
 
+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = 25
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
 {% if "spancat" in components -%}
 [components.spancat]
 factory = "spancat"
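
The quickstart template now emits a `span_finder` block whose settings map onto the component's factory config. A sketch of adding the component programmatically with the same settings, assuming the `span_finder` factory accepts the keys the template emits:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe(
        "span_finder",
        config={
            "max_length": 25,
            "min_length": None,
            "spans_key": "sc",
            "threshold": 0.5,
        },
    )
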
@@ -154,6 +181,36 @@ grad_factor = 1.0
 sizes = [1,2,3]
 {% endif -%}
 
+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.spancat_singlelabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"
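
The template likewise gains a `spancat_singlelabel` block for exclusive, softmax-scored span categorization. A sketch of the equivalent programmatic setup, again assuming the factory accepts the keys shown above:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe(
        "spancat_singlelabel",
        config={
            "spans_key": "sc",
            "negative_weight": 1.0,
            "allow_overlap": True,
        },
    )
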
@@ -219,10 +276,16 @@ no_output_layer = false
 
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = true
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
 
@@ -250,10 +313,16 @@ no_output_layer = false
 
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = false
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat_multilabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat_multilabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
 
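
In both textcat hunks, the accuracy-optimized transformer branch switches from the bag-of-words `TextCatBOW.v2` model to `TextCatCNN.v2` with a `TransformerListener`, so the text classifier shares the transformer's features instead of ignoring them (see the template comment about BOW not needing a feature source). Downstream usage is unchanged either way; a sketch with a placeholder pipeline path:

    import spacy

    # Placeholder: a pipeline trained from a config generated by this template.
    nlp = spacy.load("./trained_pipeline")
    doc = nlp("This movie was surprisingly good.")
    print(max(doc.cats, key=doc.cats.get))  # highest-scoring textcat label
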
@@ -284,6 +353,7 @@ maxout_pieces = 3
 {% if "morphologizer" in components %}
 [components.morphologizer]
 factory = "morphologizer"
+label_smoothing = 0.05
 
 [components.morphologizer.model]
 @architectures = "spacy.Tagger.v2"
@@ -297,6 +367,7 @@ width = ${components.tok2vec.model.encode.width}
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
+label_smoothing = 0.05
 
 [components.tagger.model]
 @architectures = "spacy.Tagger.v2"
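
The generated tagger and morphologizer configs now set `label_smoothing = 0.05`. Label smoothing moves a small amount of probability mass from the gold label to the other classes, which tends to reduce overconfident predictions. A generic arithmetic illustration (not spaCy's internal implementation):

    # With smoothing eps over n_classes, a one-hot target is softened:
    eps, n_classes = 0.05, 4
    target = [1.0, 0.0, 0.0, 0.0]
    smoothed = [(1 - eps) * t + eps / n_classes for t in target]
    print(smoothed)  # [0.9625, 0.0125, 0.0125, 0.0125]
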
@@ -341,6 +412,27 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 {% endif %}
 
+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = 25
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
 {% if "spancat" in components %}
 [components.spancat]
 factory = "spancat"
@@ -370,6 +462,33 @@ width = ${components.tok2vec.model.encode.width}
 sizes = [1,2,3]
 {% endif %}
 
+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"

@@ -1,15 +1,23 @@
-from typing import Optional, Dict, Any, Union
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
 import sys
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
-from ..training.loop import train as train_nlp
-from ..training.initialize import init_nlp
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import typer
+from wasabi import msg
+
 from .. import util
+from ..training.initialize import init_nlp
+from ..training.loop import train as train_nlp
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @app.command(

@@ -1,14 +1,21 @@
-from typing import Tuple
-from pathlib import Path
 import sys
-import requests
-from wasabi import msg, Printer
 import warnings
-from ._util import app
+from pathlib import Path
+from typing import Tuple
+
+import requests
+from wasabi import Printer, msg
+
 from .. import about
-from ..util import get_package_version, get_installed_models, get_minor_version
-from ..util import get_package_path, get_model_meta, is_compatible_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_model_meta,
+    get_package_path,
+    get_package_version,
+    is_compatible_version,
+)
+from ._util import app
 
 
 @app.command("validate")

@@ -1,5 +1,6 @@
 """Helpers for Python and platform compatibility."""
 import sys
+
 from thinc.util import copy_array
 
 try:

@@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
-from typing import Union, Iterable, Optional, Dict, Any, Callable
 import warnings
-from .render import DependencyRenderer, EntityRenderer, SpanRenderer
-from ..tokens import Doc, Span
+from typing import Any, Callable, Dict, Iterable, Optional, Union
+
 from ..errors import Errors, Warnings
-from ..util import is_in_jupyter
-from ..util import find_available_port
+from ..tokens import Doc, Span
+from ..util import find_available_port, is_in_jupyter
+from .render import DependencyRenderer, EntityRenderer, SpanRenderer
 
 _html = {}
 RENDER_WRAPPER = None
@@ -68,7 +66,7 @@ def render(
     if jupyter or (jupyter is None and is_in_jupyter()):
         # return HTML rendered by IPython display()
         # See #4840 for details on span wrapper to disable mathjax
-        from IPython.core.display import display, HTML
+        from IPython.core.display import HTML, display
 
         return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
     return html
@@ -125,13 +123,17 @@ def app(environ, start_response):
     return [res]
 
 
-def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
+def parse_deps(
+    orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
+) -> Dict[str, Any]:
     """Generate dependency parse in {'words': [], 'arcs': []} format.
-    orig_doc (Doc): Document to parse.
+
+    orig_doc (Union[Doc, Span]): Document to parse.
     options (Dict[str, Any]): Dependency parse specific visualisation options.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
+    if isinstance(orig_doc, Span):
+        orig_doc = orig_doc.as_doc()
     doc = Doc(orig_doc.vocab).from_bytes(
         orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
     )

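
With this change, `parse_deps()` accepts a `Span` and converts it with `as_doc()`, so a single sentence can be passed straight to the dependency visualizer. A sketch, assuming any pipeline with a parser:

    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")  # any parser-equipped pipeline works
    doc = nlp("This is a sentence. This is another one.")
    sent = next(doc.sents)
    # A Span can now be rendered directly in "dep" style:
    html = displacy.render(sent, style="dep", jupyter=False)
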
@@ -1,15 +1,28 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
 import uuid
-import itertools
+from typing import Any, Dict, List, Optional, Tuple, Union
+
 from ..errors import Errors
 from ..util import escape_html, minify_html, registry
-from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
-from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
-from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
-from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
-from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
-from .templates import TPL_TITLE
+from .templates import (
+    TPL_DEP_ARCS,
+    TPL_DEP_SVG,
+    TPL_DEP_WORDS,
+    TPL_DEP_WORDS_LEMMA,
+    TPL_ENT,
+    TPL_ENT_RTL,
+    TPL_ENTS,
+    TPL_FIGURE,
+    TPL_KB_LINK,
+    TPL_PAGE,
+    TPL_SPAN,
+    TPL_SPAN_RTL,
+    TPL_SPAN_SLICE,
+    TPL_SPAN_SLICE_RTL,
+    TPL_SPAN_START,
+    TPL_SPAN_START_RTL,
+    TPL_SPANS,
+    TPL_TITLE,
+)
 
 DEFAULT_LANG = "en"
 DEFAULT_DIR = "ltr"
@@ -204,7 +217,7 @@ class SpanRenderer:
                     + (self.offset_step * (len(entities) - 1))
                 )
                 markup += self.span_template.format(
-                    text=token["text"],
+                    text=escape_html(token["text"]),
                     span_slices=slices,
                     span_starts=starts,
                     total_height=total_height,

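
`SpanRenderer` now escapes token text before interpolating it into the span template, so text containing HTML metacharacters can no longer break, or inject into, the generated markup. A sketch of the post-fix behavior, assuming the default spans key "sc":

    from spacy import displacy
    from spacy.tokens import Doc, Span
    from spacy.vocab import Vocab

    doc = Doc(Vocab(), words=["<b>unsafe</b>", "markup", "here"])
    doc.spans["sc"] = [Span(doc, 1, 3, label="TEST")]
    html = displacy.render(doc, style="span", jupyter=False)
    assert "&lt;b&gt;" in html  # the raw "<b>" no longer reaches the markup
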
@@ -1,5 +1,5 @@
-from typing import Literal
 import warnings
+from typing import Literal
 
 
 class ErrorsWithCodes(type):
@@ -208,6 +208,9 @@ class Warnings(metaclass=ErrorsWithCodes):
     W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
+    W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
+            "key attribute for vectors, configure it through Vectors(attr=) or "
+            "'spacy init vectors --attr'")
 
     # v4 warning strings
     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
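
W125 points users at the same `attr` setting the `init vectors` change above introduces, but on the `Vectors` table itself. A hedged sketch of the configuration the warning refers to (the data here is a placeholder, and the `attr` parameter is assumed from the warning text):

    import numpy
    from spacy.vectors import Vectors

    # Rows keyed by the LOWER attribute instead of the default ORTH:
    data = numpy.zeros((3, 16), dtype="f")
    vectors = Vectors(data=data, keys=["apple", "banana", "orange"], attr="LOWER")
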
@@ -546,6 +549,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")
 
     # New errors added in v3.x
+    E850 = ("The PretrainVectors objective currently only supports default or "
+            "floret vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
             "but found value of '{val}'.")
     E852 = ("The tar file pulled from the remote attempted an unsafe path "
@@ -954,6 +959,14 @@ class Errors(metaclass=ErrorsWithCodes):
              "with `displacy.serve(doc, port=port)`")
     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
              "or use `auto_select_port=True` to pick an available port automatically.")
+    E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
+    E1052 = ("Unable to copy spans: the character offsets for the span at "
+             "index {i} in the span group do not align with the tokenization "
+             "in the target doc.")
+    E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
+             " 'min_length': {min_length}, 'max_length': {max_length}")
+    E1054 = ("The text, including whitespace, must match between reference and "
+             "predicted docs when training {component}.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
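
E1053 validates the `span_finder` length settings added elsewhere in this merge. A sketch of a config that satisfies the check, assuming the factory keys shown in the quickstart template:

    import spacy

    nlp = spacy.blank("en")
    # Valid per E1053: when set, both lengths must be larger than 0.
    nlp.add_pipe("span_finder", config={"min_length": 1, "max_length": 25})
    # A value like {"min_length": 0, ...} should trigger the E1053 message above.
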
@@ -968,6 +981,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
              "{existing_value}.")
     E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
+    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

@@ -1,4 +1,5 @@
 import warnings
+
 from .errors import Warnings
 
 

@@ -1,5 +1,5 @@
+from .candidate import Candidate, InMemoryCandidate
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, InMemoryCandidate
 
 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]

@@ -1,6 +1,8 @@
 from libcpp.vector cimport vector
-from .kb_in_memory cimport InMemoryLookupKB
+
 from ..typedefs cimport hash_t
+from .kb_in_memory cimport InMemoryLookupKB
+
 
 cdef class Candidate:
     pass

@@ -9,7 +11,7 @@ cdef class Candidate:
 cdef class InMemoryCandidate(Candidate):
     cdef readonly hash_t _entity_hash
     cdef readonly hash_t _alias_hash
-    cpdef vector[float] _entity_vector
+    cdef vector[float] _entity_vector
     cdef float _prior_prob
     cdef readonly InMemoryLookupKB _kb
     cdef float _entity_freq

@@ -1,8 +1,10 @@
 # cython: infer_types=True, profile=True
+
 from .kb_in_memory cimport InMemoryLookupKB
+
 from ..errors import Errors
 
 
 cdef class Candidate:
     """A `Candidate` object refers to a textual mention that may or may not be resolved
     to a specific entity from a Knowledge Base. This will be used as input for the entity linking

@@ -2,8 +2,10 @@
 
 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t
+
 from ..vocab cimport Vocab
+
 
 cdef class KnowledgeBase:
     cdef Pool mem
     cdef readonly Vocab vocab

					@ -1,18 +1,20 @@
 | 
				
			||||||
# cython: infer_types=True, profile=True
 | 
					# cython: infer_types=True, profile=True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
from typing import Iterable, Tuple, Union, Iterator
 | 
					from typing import Iterable, Iterator, Tuple, Union
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from cymem.cymem cimport Pool
 | 
					from cymem.cymem cimport Pool
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .candidate import Candidate
 | 
					from ..errors import Errors
 | 
				
			||||||
from ..tokens import Span, SpanGroup
 | 
					from ..tokens import Span, SpanGroup
 | 
				
			||||||
from ..util import SimpleFrozenList
 | 
					from ..util import SimpleFrozenList
 | 
				
			||||||
from ..errors import Errors
 | 
					from .candidate import Candidate
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class KnowledgeBase:
 | 
					cdef class KnowledgeBase:
 | 
				
			||||||
    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
 | 
					    """A `KnowledgeBase` instance stores unique identifiers for entities and
 | 
				
			||||||
    to support entity linking of named entities to real-world concepts.
 | 
					    their textual aliases, to support entity linking of named entities to
 | 
				
			||||||
 | 
					    real-world concepts.
 | 
				
			||||||
    This is an abstract class and requires its operations to be implemented.
 | 
					    This is an abstract class and requires its operations to be implemented.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/kb
 | 
					    DOCS: https://spacy.io/api/kb
 | 
				
			||||||
@@ -40,7 +42,9 @@ cdef class KnowledgeBase:
         RETURNS (Iterator[Iterable[Iterable[Candidate]]]): Identified candidates.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="get_candidates", name=self.__name__
+            )
         )

     def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
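
Every abstract method on `KnowledgeBase` follows the pattern above: raise `Errors.E1045` until a subclass overrides it. A minimal sketch of a custom subclass, assuming the `KnowledgeBase(vocab, entity_vector_length)` constructor used elsewhere in spaCy and the docwise `Iterator[SpanGroup]` input described in the RETURNS line; the class and its trivial candidate lookup are hypothetical:

    from spacy.kb import KnowledgeBase

    class NoCandidatesKB(KnowledgeBase):
        # Hypothetical subclass for illustration only: it resolves nothing.
        def get_candidates(self, mentions):
            for doc_mentions in mentions:          # one SpanGroup per doc
                yield [[] for _ in doc_mentions]   # empty candidate list per mention
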
@@ -58,7 +62,9 @@ cdef class KnowledgeBase:
         RETURNS (Iterable[float]): Vector for specified entity.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="get_vector", name=self.__name__
+            )
         )

     def to_bytes(self, **kwargs) -> bytes:
@@ -66,7 +72,9 @@ cdef class KnowledgeBase:
         RETURNS (bytes): Current state as binary string.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="to_bytes", name=self.__name__
+            )
         )

     def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
@@ -75,27 +83,37 @@ cdef class KnowledgeBase:
         exclude (Tuple[str]): Properties to exclude when restoring KB.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="from_bytes", name=self.__name__
+            )
         )

-    def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+    def to_disk(
+            self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+    ) -> None:
         """
         Write KnowledgeBase content to disk.
         path (Union[str, Path]): Target file path.
         exclude (Iterable[str]): List of components to exclude.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="to_disk", name=self.__name__
+            )
         )

-    def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+    def from_disk(
+            self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+    ) -> None:
         """
         Load KnowledgeBase content from disk.
         path (Union[str, Path]): Target file path.
         exclude (Iterable[str]): List of components to exclude.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="from_disk", name=self.__name__
+            )
         )

     @property
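
to_disk()/from_disk() are the file-based counterparts of to_bytes()/from_bytes(). A hedged round-trip sketch using the concrete `InMemoryLookupKB` subclass (the target directory is illustrative):

    from spacy.kb import InMemoryLookupKB
    from spacy.vocab import Vocab

    vocab = Vocab()
    kb = InMemoryLookupKB(vocab, entity_vector_length=3)
    kb.to_disk("/tmp/my_kb")       # path may be a str or a Path
    kb2 = InMemoryLookupKB(vocab, entity_vector_length=3)
    kb2.from_disk("/tmp/my_kb")    # restores entries, aliases and vectors
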
@@ -1,11 +1,11 @@
 """Knowledge-base for entity or concept linking."""
-from preshed.maps cimport PreshMap
-from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from libc.stdio cimport FILE
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap

+from ..structs cimport AliasC, KBEntryC
 from ..typedefs cimport hash_t
-from ..structs cimport KBEntryC, AliasC
 from .kb cimport KnowledgeBase

 ctypedef vector[KBEntryC] entry_vec
@@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table

     cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
         """Add an entity vector to the vectors table."""
         cdef int64_t new_index = self._vectors_table.size()
         self._vectors_table.push_back(entity_vector)
         return new_index

-    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
-                                     int32_t vector_index, int feats_row) nogil:
+    cdef inline int64_t c_add_entity(
+        self,
+        hash_t entity_hash,
+        float freq,
+        int32_t vector_index,
+        int feats_row
+    ) nogil:
         """Add an entry to the vector of entries.
-        After calling this method, make sure to update also the _entry_index using the return value"""
+        After calling this method, make sure to update also the _entry_index
+        using the return value"""
         # This is what we'll map the entity hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
         cdef int64_t new_index = self._entries.size()

-        # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
+        # Avoid struct initializer to enable nogil, cf.
+        # https://github.com/cython/cython/issues/1642
         cdef KBEntryC entry
         entry.entity_hash = entity_hash
         entry.vector_index = vector_index
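
The "update also the _entry_index" requirement is satisfied by the Python-level caller further down in this diff; the pairing looks like this (lifted from add_entity() as shown below):

        new_index = self.c_add_entity(
            entity_hash=entity_hash,
            freq=freq,
            vector_index=vector_index,
            feats_row=-1
        )
        self._entry_index[entity_hash] = new_index  # keep hash -> slot in sync
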
@@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         self._entries.push_back(entry)
         return new_index

-    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
-        """Connect a mention to a list of potential entities with their prior probabilities .
-        After calling this method, make sure to update also the _alias_index using the return value"""
-        # This is what we'll map the alias hash key to. It's where the alias will be defined
-        # in the vector of aliases.
+    cdef inline int64_t c_add_aliases(
+        self,
+        hash_t alias_hash,
+        vector[int64_t] entry_indices,
+        vector[float] probs
+    ) nogil:
+        """Connect a mention to a list of potential entities with their prior
+        probabilities. After calling this method, make sure to update also the
+        _alias_index using the return value"""
+        # This is what we'll map the alias hash key to. It's where the alias will be
+        # defined in the vector of aliases.
         cdef int64_t new_index = self._aliases_table.size()

         # Avoid struct initializer to enable nogil
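
The same caller-side invariant applies here; add_alias() below records the returned slot in _alias_index:

        new_index = self.c_add_aliases(
            alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
        )
        self._alias_index[alias_hash] = new_index
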
@@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):

     cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
         """
-        Initializing the vectors and making sure the first element of each vector is a dummy,
-        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
+        Initializing the vectors and making sure the first element of each vector is a
+        dummy, because the PreshMap maps pointing to indices in these vectors can not
+        contain 0 as value.
         cf. https://github.com/explosion/preshed/issues/17
         """
         cdef int32_t dummy_value = 0
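
The dummy at index 0 exists because a PreshMap lookup that misses returns 0, so 0 cannot double as a valid slot. A plain-Python illustration of the same convention (a dict stands in for the PreshMap):

    entries = ["<dummy>"]                      # slot 0 is never a real entry
    entry_index = {}                           # hash -> slot, like the PreshMap
    entry_index[hash("Q42")] = len(entries)    # first real entry gets slot 1
    entries.append("entry for Q42")
    assert entry_index.get(hash("Q17"), 0) == 0  # 0 unambiguously means "missing"
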
@@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 cdef class Writer:
     cdef FILE* _fp

-    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
+    cdef int write_header(
+        self, int64_t nr_entries, int64_t entity_vector_length
+    ) except -1
     cdef int write_vector_element(self, float element) except -1
-    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
+    cdef int write_entry(
+        self, hash_t entry_hash, float entry_freq, int32_t vector_index
+    ) except -1

     cdef int write_alias_length(self, int64_t alias_length) except -1
-    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
+    cdef int write_alias_header(
+        self, hash_t alias_hash, int64_t candidate_length
+    ) except -1
     cdef int write_alias(self, int64_t entry_index, float prob) except -1

     cdef int _write(self, void* value, size_t size) except -1
@@ -143,12 +161,18 @@ cdef class Writer:
 cdef class Reader:
     cdef FILE* _fp

-    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
+    cdef int read_header(
+        self, int64_t* nr_entries, int64_t* entity_vector_length
+    ) except -1
     cdef int read_vector_element(self, float* element) except -1
-    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
+    cdef int read_entry(
+        self, hash_t* entity_hash, float* freq, int32_t* vector_index
+    ) except -1

     cdef int read_alias_length(self, int64_t* alias_length) except -1
-    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
+    cdef int read_alias_header(
+        self, hash_t* alias_hash, int64_t* candidate_length
+    ) except -1
     cdef int read_alias(self, int64_t* entry_index, float* prob) except -1

     cdef int _read(self, void* value, size_t size) except -1
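
Taken together, these Writer/Reader signatures pin down a simple fixed-width binary layout: a header of two int64 values, then per-entry records of (uint64 hash, float32 freq, int32 vector index), then alias blocks. A hedged sketch of the same framing with Python's struct module; "=" assumes the native byte order that fwrite/fread would use:

    import struct

    header = struct.pack("=qq", 2, 64)            # nr_entries, entity_vector_length
    entry = struct.pack("=Qfi", 12345, 0.5, 7)    # entry_hash, entry_freq, vector_index
    nr_entries, vector_length = struct.unpack("=qq", header)
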
@@ -1,29 +1,35 @@
 # cython: infer_types=True, profile=True
-from typing import Iterable, Callable, Dict, Any, Union, Iterator
+from typing import Any, Callable, Dict, Iterable, Union

 import srsly
-from preshed.maps cimport PreshMap
+
 from cpython.exc cimport PyErr_SetFromErrno
-from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
 from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
 from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap

-from pathlib import Path
 import warnings
+from pathlib import Path

+from ..tokens import Span, SpanGroup
+
-from ..tokens import SpanGroup
 from ..typedefs cimport hash_t
-from ..errors import Errors, Warnings
+
 from .. import util
+from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList, ensure_path

 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase

 from .candidate import InMemoryCandidate


 cdef class InMemoryLookupKB(KnowledgeBase):
-    """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
-    to support entity linking of named entities to real-world concepts.
+    """An `InMemoryLookupKB` instance stores unique identifiers for entities
+    and their textual aliases, to support entity linking of named entities to
+    real-world concepts.

     DOCS: https://spacy.io/api/inmemorylookupkb
     """
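
The import churn in this and the surrounding files follows isort's conventions: alphabetical order within a section, with cimports, third-party imports and local imports grouped and blank-line separated. A sweep like this is usually regenerated mechanically; an assumed invocation would be:

    python -m isort spacy
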
@@ -66,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):

     def add_entity(self, str entity, float freq, vector[float] entity_vector):
         """
-        Add an entity to the KB, optionally specifying its log probability based on corpus frequency
+        Add an entity to the KB, optionally specifying its log probability
+        based on corpus frequency.
         Return the hash of the entity ID/name at the end.
         """
         cdef hash_t entity_hash = self.vocab.strings.add(entity)
@@ -78,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):

         # Raise an error if the provided entity vector is not of the correct length
         if len(entity_vector) != self.entity_vector_length:
-            raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+            raise ValueError(
+                Errors.E141.format(
+                    found=len(entity_vector), required=self.entity_vector_length
+                )
+            )

         vector_index = self.c_add_vector(entity_vector=entity_vector)

-        new_index = self.c_add_entity(entity_hash=entity_hash,
-                                      freq=freq,
-                                      vector_index=vector_index,
-                                      feats_row=-1)  # Features table currently not implemented
+        new_index = self.c_add_entity(
+            entity_hash=entity_hash,
+            freq=freq,
+            vector_index=vector_index,
+            feats_row=-1
+        )  # Features table currently not implemented
         self._entry_index[entity_hash] = new_index

         return entity_hash
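
A hedged usage sketch for add_entity(): the vector length must match the KB's entity_vector_length, or E141 is raised.

    from spacy.kb import InMemoryLookupKB
    from spacy.vocab import Vocab

    kb = InMemoryLookupKB(Vocab(), entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12.0, entity_vector=[1.0, 2.0, 3.0])
    kb.add_entity(entity="Q64", freq=5.0, entity_vector=[0.1, 0.2, 0.3])
    # kb.add_entity("Q1", 1.0, [1.0])  -> ValueError (E141): wrong vector length
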
@@ -110,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             else:
                 entity_vector = vector_list[i]
                 if len(entity_vector) != self.entity_vector_length:
-                    raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+                    raise ValueError(
+                        Errors.E141.format(
+                            found=len(entity_vector),
+                            required=self.entity_vector_length
+                        )
+                    )

                 entry.entity_hash = entity_hash
                 entry.freq = freq_list[i]
@@ -144,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         previous_alias_nr = self.get_size_aliases()
         # Throw an error if the length of entities and probabilities are not the same
         if not len(entities) == len(probabilities):
-            raise ValueError(Errors.E132.format(alias=alias,
-                                                entities_length=len(entities),
-                                                probabilities_length=len(probabilities)))
+            raise ValueError(
+                Errors.E132.format(
+                    alias=alias,
+                    entities_length=len(entities),
+                    probabilities_length=len(probabilities))
+            )

-        # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
+        # Throw an error if the probabilities sum up to more than 1 (allow for
+        # some rounding errors)
         prob_sum = sum(probabilities)
         if prob_sum > 1.00001:
             raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
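
Continuing that sketch for add_alias(): the prior probabilities across an alias's candidate entities must not sum past ~1.0, or E133 is raised as shown above.

    kb.add_alias(alias="Douglas", entities=["Q42", "Q64"], probabilities=[0.8, 0.1])
    # kb.add_alias("Douglas", ["Q42"], probabilities=[1.2])  -> ValueError (E133)
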
@@ -165,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):

         for entity, prob in zip(entities, probabilities):
             entity_hash = self.vocab.strings[entity]
-            if not entity_hash in self._entry_index:
+            if entity_hash not in self._entry_index:
                 raise ValueError(Errors.E134.format(entity=entity))

             entry_index = <int64_t>self._entry_index.get(entity_hash)
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))

-        new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
+        new_index = self.c_add_aliases(
+            alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
+        )
         self._alias_index[alias_hash] = new_index

         if previous_alias_nr + 1 != self.get_size_aliases():
             raise RuntimeError(Errors.E891.format(alias=alias))
         return alias_hash

-    def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
+    def append_alias(
+        self, str alias, str entity, float prior_prob, ignore_warnings=False
+    ):
         """
-        For an alias already existing in the KB, extend its potential entities with one more.
+        For an alias already existing in the KB, extend its potential entities
+        with one more.
         Throw a warning if either the alias or the entity is unknown,
         or when the combination is already previously recorded.
         Throw an error if this entity+prior prob would exceed the sum of 1.
-        For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
+        For efficiency, it's best to use the method `add_alias` as much as
+        possible instead of this one.
         """
         # Check if the alias exists in the KB
         cdef hash_t alias_hash = self.vocab.strings[alias]
-        if not alias_hash in self._alias_index:
+        if alias_hash not in self._alias_index:
             raise ValueError(Errors.E176.format(alias=alias))

         # Check if the entity exists in the KB
         cdef hash_t entity_hash = self.vocab.strings[entity]
-        if not entity_hash in self._entry_index:
+        if entity_hash not in self._entry_index:
             raise ValueError(Errors.E134.format(entity=entity))
         entry_index = <int64_t>self._entry_index.get(entity_hash)

-        # Throw an error if the prior probabilities (including the new one) sum up to more than 1
+        # Throw an error if the prior probabilities (including the new one)
+        # sum up to more than 1
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
         current_sum = sum([p for p in alias_entry.probs])
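
A hedged sketch for append_alias(): it extends an existing alias with one more known entity, subject to the same <= 1.0 probability budget checked just above ("Q7251" is a made-up extra entity ID):

    kb.add_entity(entity="Q7251", freq=3.0, entity_vector=[0.2, 0.2, 0.2])
    kb.append_alias(alias="Douglas", entity="Q7251", prior_prob=0.05)
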
@@ -232,12 +261,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):

     def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
         """
-        Return candidate entities for an alias. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If the alias is not known in the KB, and empty list is returned.
+        Return candidate entities for an alias. Each candidate defines the
+        entity, the original alias, and the prior probability of that alias
+        resolving to that entity.
+        If the alias is not known in the KB, an empty list is returned.
         """
         cdef hash_t alias_hash = self.vocab.strings[alias]
-        if not alias_hash in self._alias_index:
+        if alias_hash not in self._alias_index:
             return []
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
@@ -266,8 +296,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         return self._vectors_table[self._entries[entry_index].vector_index]

     def get_prior_prob(self, str entity, str alias):
-        """ Return the prior probability of a given alias being linked to a given entity,
-        or return 0.0 when this combination is not known in the knowledge base"""
+        """ Return the prior probability of a given alias being linked to a
+        given entity, or return 0.0 when this combination is not known in the
+        knowledge base."""
         cdef hash_t alias_hash = self.vocab.strings[alias]
         cdef hash_t entity_hash = self.vocab.strings[entity]
@@ -278,7 +309,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         entry_index = self._entry_index[entity_hash]

         alias_entry = self._aliases_table[alias_index]
-        for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
+        for (entry_index, prior_prob) in zip(
+            alias_entry.entry_indices, alias_entry.probs
+        ):
             if self._entries[entry_index].entity_hash == entity_hash:
                 return prior_prob
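
A hedged check against the values added in the earlier sketches (probs are stored as C floats, so compare with a tolerance):

    assert abs(kb.get_prior_prob(entity="Q42", alias="Douglas") - 0.8) < 1e-6
    assert kb.get_prior_prob(entity="Q42", alias="Arthur") == 0.0  # unknown alias
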
@@ -291,13 +324,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         """Serialize the current state to a binary string.
         """
         def serialize_header():
-            header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
+            header = (
+                self.get_size_entities(),
+                self.get_size_aliases(),
+                self.entity_vector_length
+            )
             return srsly.json_dumps(header)

         def serialize_entries():
             i = 1
             tuples = []
-            for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+            for entry_hash, entry_index in sorted(
+                self._entry_index.items(), key=lambda x: x[1]
+            ):
                 entry = self._entries[entry_index]
                 assert entry.entity_hash == entry_hash
                 assert entry_index == i
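
A hedged round trip through the byte serialization assembled here:

    kb_bytes = kb.to_bytes()
    kb2 = InMemoryLookupKB(kb.vocab, entity_vector_length=3)
    kb2.from_bytes(kb_bytes)
    assert kb2.get_size_entities() == kb.get_size_entities()
    assert kb2.get_size_aliases() == kb.get_size_aliases()
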
@@ -310,7 +349,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             headers = []
             indices_lists = []
             probs_lists = []
-            for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+            for alias_hash, alias_index in sorted(
+                self._alias_index.items(), key=lambda x: x[1]
+            ):
                 alias = self._aliases_table[alias_index]
                 assert alias_index == i
                 candidate_length = len(alias.entry_indices)
@@ -368,7 +409,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             indices = srsly.json_loads(all_data[1])
             probs = srsly.json_loads(all_data[2])
             for header, indices, probs in zip(headers, indices, probs):
-                alias_hash, candidate_length = header
+                alias_hash, _candidate_length = header
                 alias.entry_indices = indices
                 alias.probs = probs
                 self._aliases_table[i] = alias
@@ -417,10 +458,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
                 writer.write_vector_element(element)
             i = i+1

-        # dumping the entry records in the order in which they are in the _entries vector.
-        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+        # dumping the entry records in the order in which they are in the
+        # _entries vector.
+        # index 0 is a dummy object not stored in the _entry_index and can
+        # be ignored.
         i = 1
-        for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+        for entry_hash, entry_index in sorted(
+            self._entry_index.items(), key=lambda x: x[1]
+        ):
             entry = self._entries[entry_index]
             assert entry.entity_hash == entry_hash
             assert entry_index == i
@@ -432,7 +477,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         # dumping the aliases in the order in which they are in the _alias_index vector.
         # index 0 is a dummy object not stored in the _aliases_table and can be ignored.
         i = 1
-        for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+        for alias_hash, alias_index in sorted(
+                self._alias_index.items(), key=lambda x: x[1]
+        ):
             alias = self._aliases_table[alias_index]
             assert alias_index == i
@@ -538,7 +585,8 @@ cdef class Writer:
     def __init__(self, path):
         assert isinstance(path, Path)
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
+        cdef bytes bytes_loc = content.encode('utf8') \
+            if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'wb')
         if not self._fp:
             raise IOError(Errors.E146.format(path=path))
@@ -548,14 +596,18 @@ cdef class Writer:
         cdef size_t status = fclose(self._fp)
         assert status == 0

-    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
+    cdef int write_header(
+        self, int64_t nr_entries, int64_t entity_vector_length
+    ) except -1:
        self._write(&nr_entries, sizeof(nr_entries))
        self._write(&entity_vector_length, sizeof(entity_vector_length))

     cdef int write_vector_element(self, float element) except -1:
         self._write(&element, sizeof(element))

-    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
+    cdef int write_entry(
+        self, hash_t entry_hash, float entry_freq, int32_t vector_index
+    ) except -1:
         self._write(&entry_hash, sizeof(entry_hash))
         self._write(&entry_freq, sizeof(entry_freq))
         self._write(&vector_index, sizeof(vector_index))
@@ -564,7 +616,9 @@ cdef class Writer:
     cdef int write_alias_length(self, int64_t alias_length) except -1:
         self._write(&alias_length, sizeof(alias_length))

-    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
+    cdef int write_alias_header(
+        self, hash_t alias_hash, int64_t candidate_length
+    ) except -1:
         self._write(&alias_hash, sizeof(alias_hash))
         self._write(&candidate_length, sizeof(candidate_length))
@@ -580,16 +634,19 @@ cdef class Writer:
 cdef class Reader:
     def __init__(self, path):
         content = bytes(path)
-        cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
+        cdef bytes bytes_loc = content.encode('utf8') \
+            if type(content) == str else content
         self._fp = fopen(<char*>bytes_loc, 'rb')
         if not self._fp:
             PyErr_SetFromErrno(IOError)
-        status = fseek(self._fp, 0, 0)  # this can be 0 if there is no header
+        fseek(self._fp, 0, 0)  # this can be 0 if there is no header

     def __dealloc__(self):
         fclose(self._fp)

-    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
+    cdef int read_header(
+        self, int64_t* nr_entries, int64_t* entity_vector_length
+    ) except -1:
         status = self._read(nr_entries, sizeof(int64_t))
         if status < 1:
             if feof(self._fp):
@@ -609,7 +666,9 @@ cdef class Reader:
                 return 0  # end of file
             raise IOError(Errors.E145.format(param="vector element"))

-    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
+    cdef int read_entry(
+        self, hash_t* entity_hash, float* freq, int32_t* vector_index
+    ) except -1:
         status = self._read(entity_hash, sizeof(hash_t))
         if status < 1:
             if feof(self._fp):
@@ -640,7 +699,9 @@ cdef class Reader:
                 return 0  # end of file
             raise IOError(Errors.E145.format(param="alias length"))

-    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
+    cdef int read_alias_header(
+        self, hash_t* alias_hash, int64_t* candidate_length
+    ) except -1:
         status = self._read(alias_hash, sizeof(hash_t))
         if status < 1:
             if feof(self._fp):
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults


 class AfrikaansDefaults(BaseDefaults):
@@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc


 class AmharicDefaults(BaseDefaults):
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH

 _exc = {}
@@ -1,8 +1,8 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults


 class ArabicDefaults(BaseDefaults):
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _suffixes = (
     LIST_PUNCT
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS


 class AzerbaijaniDefaults(BaseDefaults):
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 # Eleven, twelve etc. are written separate: on bir, on iki

 _num_words = [
@@ -1,12 +1,14 @@
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..punctuation import (
+    COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+    COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc


 class BulgarianDefaults(BaseDefaults):
| 
						 | 
spacy/lang/bg/lex_attrs.py
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM
 
-
 _num_words = [
     "нула",
     "едно",
spacy/lang/bg/tokenizer_exceptions.py
@@ -4,8 +4,7 @@ References:
     (countries, occupations, fields of studies and more).
 """
 
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH
 
 _exc = {}
 
spacy/lang/bn/__init__.py
@@ -1,10 +1,12 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
 from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class BengaliDefaults(BaseDefaults):
spacy/lang/bn/punctuation.py
@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)
 
-
 _currency = r"\$¢£€¥฿৳"
 _quotes = CONCAT_QUOTES.replace("'", "")
spacy/lang/bn/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
-
 _exc = {}
 
spacy/lang/ca/__init__.py
@@ -1,14 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
 
 from thinc.api import Model
 
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
 from .lemmatizer import CatalanLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class CatalanDefaults(BaseDefaults):
spacy/lang/ca/lex_attrs.py
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM
 
-
 _num_words = [
     "zero",
     "un",
spacy/lang/ca/punctuation.py
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import LIST_CURRENCY
-from ..char_classes import CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-from ..char_classes import merge_chars, _units
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    _units,
+    merge_chars,
+)
 
-
 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
 
spacy/lang/ca/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import NOUN, PROPN
+from ...tokens import Doc, Span
 
 
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
spacy/lang/ca/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
-
 _exc = {}
 
spacy/lang/cs/__init__.py
@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
 
 
 class CzechDefaults(BaseDefaults):
spacy/lang/da/__init__.py
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class DanishDefaults(BaseDefaults):
spacy/lang/da/lex_attrs.py
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM
 
-
 # Source http://fjern-uv.dk/tal.php
 _num_words = """nul
 en et to tre fire fem seks syv otte ni ti
spacy/lang/da/punctuation.py
@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)
 from ..punctuation import TOKENIZER_SUFFIXES
 
-
 _quotes = CONCAT_QUOTES.replace("'", "")
 
 _infixes = (
spacy/lang/da/syntax_iterators.py
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span
 
 
 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
spacy/lang/da/tokenizer_exceptions.py
@@ -2,10 +2,9 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
-
 _exc = {}
 
spacy/lang/de/__init__.py
@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class GermanDefaults(BaseDefaults):
spacy/lang/de/punctuation.py
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    UNITS,
+)
 from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
 
-
 _prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
 
 _suffixes = (
spacy/lang/de/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
 
-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span
 
 
spacy/lang/de/tokenizer_exceptions.py
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
-
 _exc = {
     "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
spacy/lang/dsb/__init__.py
@@ -1,6 +1,6 @@
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
 
 
 class LowerSorbianDefaults(BaseDefaults):
spacy/lang/el/__init__.py
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
 
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+
+from ...language import BaseDefaults, Language
 from .lemmatizer import GreekLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class GreekDefaults(BaseDefaults):
spacy/lang/el/get_pos_from_wiktionary.py
@@ -1,5 +1,6 @@
 def get_pos_from_wiktionary():
     import re
+
     from gensim.corpora.wikicorpus import extract_pages
 
     regex = re.compile(r"==={{(\w+)\|el}}===")
spacy/lang/el/punctuation.py
@@ -1,6 +1,16 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES, CURRENCY
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    HYPHENS,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+)
 
 _units = (
     "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
spacy/lang/el/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
 
-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span
 
 
spacy/lang/el/tokenizer_exceptions.py
@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
 _exc = {}
 
spacy/lang/en/__init__.py
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
 
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_INFIXES
+
+from ...language import BaseDefaults, Language
 from .lemmatizer import EnglishLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 
 
 class EnglishDefaults(BaseDefaults):
spacy/lang/en/punctuation.py
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)
 
 _infixes = (
     LIST_ELLIPSES
spacy/lang/en/syntax_iterators.py
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union
 
-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span
 
 
spacy/lang/en/tokenizer_exceptions.py
@@ -1,8 +1,8 @@
 from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc
+
+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 
-
 _exc: Dict[str, List[Dict]] = {}
 _exclude = [
Some files were not shown because too many files have changed in this diff.
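Every hunk shown here is the same mechanical change: duplicate `from X import ...` statements are merged, imported names are sorted alphabetically (case-sensitively, uppercase constants before lowercase names), and import lists that exceed the line length are wrapped one name per line with a trailing comma. As a minimal sketch of reproducing that style with isort's Python API: it assumes isort >= 5 is installed, and `profile="black"` is inferred from the parenthesized, trailing-comma output in these hunks rather than stated anywhere in this diff (the project's actual isort settings, e.g. for relative-import ordering, are not shown here).

# Minimal sketch, not part of this diff: reproduce the import style above.
# Assumptions: isort >= 5; profile="black" is inferred from the output style,
# not confirmed by anything in this diff.
import isort

messy = (
    "from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS\n"
    "from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA\n"
)

# isort merges imports of the same module, sorts the imported names, and,
# past the line-length limit, wraps them one per line with a trailing comma.
print(isort.code(messy, profile="black"))

Applied tree-wide, the equivalent CLI call would be along the lines of `isort --profile black spacy/`. None of these hunks change runtime behavior; only the order and layout of imports differ.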