Mirror of https://github.com/explosion/spaCy.git

Merge branch 'v4' into feature/multiple-code-files
commit 28c8a577fc

.github/azure-steps.yml (vendored): 129 lines deleted
@@ -1,129 +0,0 @@
-parameters:
-  python_version: ''
-  architecture: 'x64'
-  num_build_jobs: 2
-
-steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: ${{ parameters.python_version }}
-      architecture: ${{ parameters.architecture }}
-      allowUnstable: true
-
-  - bash: |
-      echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
-    displayName: 'Set variables'
-
-  - script: |
-      python -m pip install -U build pip setuptools
-      python -m pip install -U -r requirements.txt
-    displayName: "Install dependencies"
-
-  - script: |
-      python -m build --sdist
-    displayName: "Build sdist"
-
-  - script: |
-      python -m mypy spacy
-    displayName: 'Run mypy'
-    condition: ne(variables['python_version'], '3.6')
-
-  - task: DeleteFiles@1
-    inputs:
-      contents: "spacy"
-    displayName: "Delete source directory"
-
-  - task: DeleteFiles@1
-    inputs:
-      contents: "*.egg-info"
-    displayName: "Delete egg-info directory"
-
-  - script: |
-      python -m pip freeze > installed.txt
-      python -m pip uninstall -y -r installed.txt
-    displayName: "Uninstall all packages"
-
-  - bash: |
-      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
-    displayName: "Install from sdist"
-
-  - script: |
-      python -W error -c "import spacy"
-    displayName: "Test import"
-
-#  - script: |
-#      python -m spacy download ca_core_news_sm
-#      python -m spacy download ca_core_news_md
-#      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-#    displayName: 'Test download CLI'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-#    displayName: 'Test no warnings on load (#11713)'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
-#    displayName: 'Test skip re-download (#12188)'
-#    condition: eq(variables['python_version'], '3.8')
-
-#  - script: |
-#      python -W error -m spacy info ca_core_news_sm | grep -q download_url
-#    displayName: 'Test download_url in info CLI'
-#    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
-    displayName: 'Test convert CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m spacy init config -p ner -l ca ner.cfg
-      python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
-    displayName: 'Test debug config CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      # will have errors due to sparse data, check for summary in output
-      python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
-    displayName: 'Test debug data CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
-    displayName: 'Test train CLI'
-    condition: eq(variables['python_version'], '3.8')
-
-#  - script: |
-#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-#      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-#    displayName: 'Test assemble CLI'
-#    condition: eq(variables['python_version'], '3.8')
-#
-#  - script: |
-#      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-#      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-#    displayName: 'Test assemble CLI vectors warning'
-#    condition: eq(variables['python_version'], '3.8')
-
-  - script: |
-      python -m pip install -U -r requirements.txt
-    displayName: "Install test requirements"
-
-  - script: |
-      python -m pytest --pyargs spacy -W error
-    displayName: "Run CPU tests"
-
-  - script: |
-      python -m pip install 'spacy[apple]'
-      python -m pytest --pyargs spacy
-    displayName: "Run CPU tests with thinc-apple-ops"
-    condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
-
-  - script: |
-      python .github/validate_universe_json.py website/meta/universe.json
-    displayName: 'Test website/meta/universe.json'
-    condition: eq(variables['python_version'], '3.8')
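Note: the core of the template deleted above is an sdist round-trip: build the archive, delete the source tree and every installed package, install only the built archive, then import it with warnings promoted to errors, so the test can only pass against the packaged code. A minimal sketch of the same sequence in GitHub Actions syntax (step names and the wildcard install are illustrative, not from this commit):

    steps:
      - name: Build the source distribution
        run: python -m build --sdist
      - name: Delete the source tree so imports cannot resolve against it
        run: rm -rf spacy *.egg-info
        shell: bash
      - name: Uninstall everything, then install only the built sdist
        shell: bash
        run: |
          python -m pip freeze > installed.txt
          python -m pip uninstall -y -r installed.txt
          python -m pip install dist/*.tar.gz
      - name: Import must succeed with warnings treated as errors
        run: python -W error -c "import spacy"

The new .github/workflows/tests.yml below carries this sequence over almost verbatim.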
							
								
								
									
.github/workflows/autoblack.yml (vendored): 45 lines deleted

@@ -1,45 +0,0 @@
-# GitHub Action that uses Black to reformat all Python code and submits a PR
-# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
-
-name: autoblack
-on:
-  workflow_dispatch:  # allow manual trigger
-  schedule:
-    - cron: '0 8 * * 5'  # every Friday at 8am UTC
-
-jobs:
-  autoblack:
-    if: github.repository_owner == 'explosion'
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-        with:
-            ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v4
-      - run: pip install black -c requirements.txt
-      - name: Auto-format code if needed
-        run: black spacy
-      # We can't run black --check here because that returns a non-zero exit
-      # code and makes GitHub think the action failed
-      - name: Check for modified files
-        id: git-check
-        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
-
-      - name: Create Pull Request
-        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v4
-        with:
-            title: Auto-format code with black
-            labels: meta
-            commit-message: Auto-format code with black
-            committer: GitHub <noreply@github.com>
-            author: explosion-bot <explosion-bot@users.noreply.github.com>
-            body: _This PR is auto-generated._
-            branch: autoblack
-            delete-branch: true
-            draft: false
-      - name: Check outputs
-        if: steps.git-check.outputs.modified == 'true'
-        run: |
-          echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
-          echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
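Note: the workflow above deliberately avoided `black --check`, whose non-zero exit code would mark the job as failed; instead it reformatted in place and asked git whether anything changed. The one-line `git diff-index` check expands to this pattern (a sketch, not part of the commit):

      - name: Check for modified files
        id: git-check
        run: |
          # diff-index --quiet exits 0 when the working tree matches HEAD
          if git diff-index --quiet HEAD --; then
            echo "modified=false" >> "$GITHUB_OUTPUT"
          else
            echo "modified=true" >> "$GITHUB_OUTPUT"
          fi
      - name: Runs only when the formatter changed something
        if: steps.git-check.outputs.modified == 'true'
        run: echo "black reformatted at least one file"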
							
								
								
									
.github/workflows/explosionbot.yml (vendored): 1 line added

@@ -8,6 +8,7 @@ on:

 jobs:
   explosion-bot:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - name: Dump GitHub context
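Note: the `if: github.repository_owner == 'explosion'` guard added here, and to issue-manager.yml, lock.yml and spacy_universe_alert.yml below, is a standard fork guard: on a fork, `github.repository_owner` is the fork owner's account name, so scheduled and bot-driven jobs are skipped there instead of failing or spamming. The pattern in isolation (hypothetical job name):

    jobs:
      some-automation:  # hypothetical job name
        if: github.repository_owner == 'explosion'  # skipped on forks
        runs-on: ubuntu-latest
        steps:
          - run: echo "only runs in the upstream repository"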
							
								
								
									
.github/workflows/issue-manager.yml (vendored): 1 line added

@@ -13,6 +13,7 @@ on:

 jobs:
   issue-manager:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: tiangolo/issue-manager@0.4.0
							
								
								
									
.github/workflows/lock.yml (vendored): 1 line added

@@ -13,6 +13,7 @@ concurrency:

 jobs:
   action:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
       - uses: dessant/lock-threads@v4
							
								
								
									
.github/workflows/spacy_universe_alert.yml (vendored): 1 line added

@@ -7,6 +7,7 @@ on:

 jobs:
   build:
+    if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest

     steps:
							
								
								
									
.github/workflows/tests.yml (vendored, new file): 173 lines added

@@ -0,0 +1,173 @@
+name: tests
+
+on:
+  push:
+    branches-ignore:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+    paths-ignore:
+      - "*.md"
+      - "*.mdx"
+      - "website/**"
+      - ".github/workflows/**"
+  pull_request:
+    types: [opened, synchronize, reopened, edited]
+    paths-ignore:
+      - "*.md"
+      - "*.mdx"
+      - "website/**"
+
+jobs:
+  validate:
+    name: Validate
+    if: github.repository_owner == 'explosion'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          architecture: x64
+
+      - name: black
+        run: |
+          python -m pip install black -c requirements.txt
+          python -m black spacy --check
+      - name: isort
+        run: |
+          python -m pip install isort -c requirements.txt
+          python -m isort spacy --check
+      - name: flake8
+        run: |
+          python -m pip install flake8==5.0.4
+          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+  tests:
+    name: Test
+    needs: Validate
+    strategy:
+      fail-fast: true
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python_version: ["3.11"]
+        include:
+          - os: macos-latest
+            python_version: "3.8"
+          - os: ubuntu-20.04
+            python_version: "3.9"
+          - os: windows-latest
+            python_version: "3.10"
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python_version }}
+          architecture: x64
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -U build pip setuptools
+          python -m pip install -U -r requirements.txt
+
+      - name: Build sdist
+        run: |
+          python -m build --sdist
+
+      - name: Run mypy
+        run: |
+          python -m mypy spacy
+
+      - name: Delete source directory and .egg-info
+        run: |
+          rm -rf spacy *.egg-info
+        shell: bash
+
+      - name: Uninstall all packages
+        run: |
+          python -m pip freeze
+          python -m pip freeze --exclude pywin32 > installed.txt
+          python -m pip uninstall -y -r installed.txt
+
+      - name: Install from sdist
+        run: |
+          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+          SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
+        shell: bash
+
+      - name: Test import
+        run: python -W error -c "import spacy"
+
+      #      - name: "Test download CLI"
+      #        run: |
+      #          python -m spacy download ca_core_news_sm
+      #          python -m spacy download ca_core_news_md
+      #          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test download_url in info CLI"
+      #        run: |
+      #          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test no warnings on load (#11713)"
+      #        run: |
+      #          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+      #        if: matrix.python_version == '3.9'
+
+      - name: "Test convert CLI"
+        run: |
+          python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
+        if: matrix.python_version == '3.9'
+
+      - name: "Test debug config CLI"
+        run: |
+          python -m spacy init config -p ner -l ca ner.cfg
+          python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
+        if: matrix.python_version == '3.9'
+
+      - name: "Test debug data CLI"
+        run: |
+          # will have errors due to sparse data, check for summary in output
+          python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
+        if: matrix.python_version == '3.9'
+
+      - name: "Test train CLI"
+        run: |
+          python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
+        if: matrix.python_version == '3.9'
+
+      #      - name: "Test assemble CLI"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+      #          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+      #        if: matrix.python_version == '3.9'
+      #
+      #      - name: "Test assemble CLI vectors warning"
+      #        run: |
+      #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+      #          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+      #        if: matrix.python_version == '3.9'
+
+      - name: "Install test requirements"
+        run: |
+          python -m pip install -U -r requirements.txt
+
+      - name: "Run CPU tests"
+        run: |
+          python -m pytest --pyargs spacy -W error
+        if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
+
+      - name: "Run CPU tests with thinc-apple-ops"
+        run: |
+          python -m pip install 'spacy[apple]'
+          python -m pytest --pyargs spacy
+        if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
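Note: several of the CLI smoke tests above assert on command output with `grep -q`, which exits 0 only when the pattern is found, so a missing marker fails the step. Without `set -o pipefail`, the step's exit status is grep's alone, which is exactly why the "Test debug data CLI" step can tolerate a command that reports errors as long as the summary appears. A minimal sketch of the pattern (command and marker are placeholders):

      - name: Assert an expected output marker
        shell: bash
        # grep -q is silent; exit status is the assertion
        run: some-command 2>&1 | grep -q expected-marker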
							
								
								
									
.github/workflows/universe_validation.yml (vendored, new file): 33 lines added

@@ -0,0 +1,33 @@
+name: universe validation
+
+on:
+  push:
+    branches-ignore:
+      - "spacy.io"
+      - "nightly.spacy.io"
+      - "v2.spacy.io"
+    paths:
+      - "website/meta/universe.json"
+  pull_request:
+    types: [opened, synchronize, reopened, edited]
+    paths:
+      - "website/meta/universe.json"
+
+jobs:
+  validate:
+    name: Validate
+    if: github.repository_owner == 'explosion'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+
+      - name: Configure Python version
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          architecture: x64
+
+      - name: Validate website/meta/universe.json
+        run: |
+          python .github/validate_universe_json.py website/meta/universe.json
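Note: these path filters are the complement of the ones in tests.yml: tests.yml ignores `website/**`, so without this workflow a push touching only universe.json would get no validation at all. Splitting the check into its own workflow keeps universe data validated without paying for the full test matrix. The trigger in isolation:

    on:
      push:
        paths:
          - "website/meta/universe.json"  # fire only when this file changes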
README.md

@@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 
+💥 **We'd love to hear more about your experience with spaCy!**
+[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
+
 💫 **Version 3.5 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
@@ -33,7 +36,7 @@ open-source software, released under the [MIT license](https://github.com/explos
 ## 📖 Documentation
 
 | Documentation                 |                                                                        |
-| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------- | ---------------------------------------------------------------------- |
 | ⭐️ **[spaCy 101]**           | New to spaCy? Here's everything you need to know!                      |
 | 📚 **[Usage Guides]**         | How to use spaCy and its features.                                     |
 | 🚀 **[New in v3.0]**          | New features, backwards incompatibilities and migration guide.         |
@@ -41,6 +44,7 @@ open-source software, released under the [MIT license](https://github.com/explos
 | 🎛 **[API Reference]**         | The detailed reference for spaCy's API.                                |
 | 📦 **[Models]**               | Download trained pipelines for spaCy.                                  |
 | 🌌 **[Universe]**             | Plugins, extensions, demos and books from the spaCy ecosystem.         |
+| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
@@ -54,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
+[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
 
-
 ## 💬 Where to ask questions
 
 The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
azure-pipelines.yml: 99 lines deleted

@@ -1,99 +0,0 @@
-trigger:
-  batch: true
-  branches:
-    include:
-      - "*"
-    exclude:
-      - "spacy.io"
-      - "nightly.spacy.io"
-      - "v2.spacy.io"
-  paths:
-    exclude:
-      - "website/*"
-      - "*.md"
-      - "*.mdx"
-      - ".github/workflows/*"
-pr:
-  paths:
-    exclude:
-      - "*.md"
-      - "*.mdx"
-      - "website/docs/*"
-      - "website/src/*"
-      - "website/meta/*.tsx"
-      - "website/meta/*.mjs"
-      - "website/meta/languages.json"
-      - "website/meta/site.json"
-      - "website/meta/sidebars.json"
-      - "website/meta/type-annotations.json"
-      - "website/pages/*"
-      - ".github/workflows/*"
-
-jobs:
-  # Check formatting and linting. Perform basic checks for most important errors
-  # (syntax etc.) Uses the config defined in setup.cfg and overwrites the
-  # selected codes.
-  - job: "Validate"
-    pool:
-      vmImage: "ubuntu-latest"
-    steps:
-      - task: UsePythonVersion@0
-        inputs:
-          versionSpec: "3.8"
-      - script: |
-          pip install black -c requirements.txt
-          python -m black spacy --check
-        displayName: "black"
-      - script: |
-          pip install flake8==5.0.4
-          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
-        displayName: "flake8"
-
-  - job: "Test"
-    dependsOn: "Validate"
-    strategy:
-      matrix:
-        # We're only running one platform per Python version to speed up builds
-        #        Python38Linux:
-        #          imageName: "ubuntu-latest"
-        #          python.version: "3.8"
-        #        Python38Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.8"
-        Python38Mac:
-          imageName: "macos-latest"
-          python.version: "3.8"
-        Python39Linux:
-          imageName: "ubuntu-latest"
-          python.version: "3.9"
-        #        Python39Windows:
-        #          imageName: "windows-latest"
-        #          python.version: "3.9"
-        #        Python39Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.9"
-        #        Python310Linux:
-        #          imageName: "ubuntu-latest"
-        #          python.version: "3.10"
-        Python310Windows:
-          imageName: "windows-latest"
-          python.version: "3.10"
-        #        Python310Mac:
-        #          imageName: "macos-latest"
-        #          python.version: "3.10"
-        Python311Linux:
-          imageName: 'ubuntu-latest'
-          python.version: '3.11'
-        Python311Windows:
-          imageName: 'windows-latest'
-          python.version: '3.11'
-        Python311Mac:
-          imageName: 'macos-latest'
-          python.version: '3.11'
-      maxParallel: 4
-    pool:
-      vmImage: $(imageName)
-    steps:
-      - template: .github/azure-steps.yml
-        parameters:
-          python_version: '$(python.version)'
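Note: the deleted pipeline drove the (also deleted) .github/azure-steps.yml as a parameterized template, one invocation per matrix entry; the GitHub Actions matrix in tests.yml above replaces both files. The Azure template-call pattern being retired, copied from the removed config:

    steps:
      - template: .github/azure-steps.yml
        parameters:
          python_version: '$(python.version)'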
build-constraints.txt

@@ -1,6 +1,4 @@
 # build version constraints for use with wheelwright + multibuild
-numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
 numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
pyproject.toml

@@ -9,3 +9,6 @@ requires = [
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"
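Note: `profile = "black"` makes isort produce black-compatible output (parenthesized multi-line imports with trailing commas), which is what generates the large import reshuffles in the spacy/ modules below. The new Validate job enforces it with these commands from tests.yml above; both read this `[tool.isort]` section automatically:

      - name: isort
        run: |
          python -m pip install isort -c requirements.txt
          python -m isort spacy --check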
requirements.txt

@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.8.0
+typer>=0.3.0,<0.10.0
 pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
 # Third party dependencies
@@ -30,10 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
-types-dataclasses>=0.1.3; python_version < "3.7"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
+isort>=5.0,<6.0
							
								
								
									
setup.cfg: 46 lines changed

@@ -30,6 +30,14 @@ project_urls =
 zip_safe = false
 include_package_data = true
 python_requires = >=3.8
+setup_requires =
+    cython>=0.25,<3.0
+    numpy>=1.15.0
+    # We also need our Cython packages here to compile against
+    cymem>=2.0.2,<2.1.0
+    preshed>=3.0.2,<3.1.0
+    murmurhash>=0.28.0,<1.1.0
+    thinc>=9.0.0.dev2,<9.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=4.0.0.dev0,<4.1.0
@@ -42,7 +50,7 @@ install_requires =
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
     # Third-party dependencies
-    typer>=0.3.0,<0.8.0
+    typer>=0.3.0,<0.10.0
     pathy>=0.10.0
     smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
@@ -67,41 +75,41 @@ transformers =
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<12.0.0
+    cupy>=5.0.0b4,<13.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<12.0.0
+    cupy-cuda80>=5.0.0b4,<13.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<12.0.0
+    cupy-cuda90>=5.0.0b4,<13.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<12.0.0
+    cupy-cuda91>=5.0.0b4,<13.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<12.0.0
+    cupy-cuda92>=5.0.0b4,<13.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<12.0.0
+    cupy-cuda100>=5.0.0b4,<13.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<12.0.0
+    cupy-cuda101>=5.0.0b4,<13.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<12.0.0
+    cupy-cuda102>=5.0.0b4,<13.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<12.0.0
+    cupy-cuda110>=5.0.0b4,<13.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<12.0.0
+    cupy-cuda111>=5.0.0b4,<13.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<12.0.0
+    cupy-cuda112>=5.0.0b4,<13.0.0
 cuda113 =
-    cupy-cuda113>=5.0.0b4,<12.0.0
+    cupy-cuda113>=5.0.0b4,<13.0.0
 cuda114 =
-    cupy-cuda114>=5.0.0b4,<12.0.0
+    cupy-cuda114>=5.0.0b4,<13.0.0
 cuda115 =
-    cupy-cuda115>=5.0.0b4,<12.0.0
+    cupy-cuda115>=5.0.0b4,<13.0.0
 cuda116 =
-    cupy-cuda116>=5.0.0b4,<12.0.0
+    cupy-cuda116>=5.0.0b4,<13.0.0
 cuda117 =
-    cupy-cuda117>=5.0.0b4,<12.0.0
+    cupy-cuda117>=5.0.0b4,<13.0.0
 cuda11x =
-    cupy-cuda11x>=11.0.0,<12.0.0
+    cupy-cuda11x>=11.0.0,<13.0.0
 cuda-autodetect =
-    cupy-wheel>=11.0.0,<12.0.0
+    cupy-wheel>=11.0.0,<13.0.0
 apple =
     thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies
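Note: every cupy pin above moves from `<12.0.0` to `<13.0.0`, i.e. the GPU extras now accept cupy 12.x. The extras defined here are consumed at install time; a sketch of typical usage (the extras names come from the config above, the CUDA flavor is the user's choice):

      - name: Install spaCy with GPU support
        run: |
          pip install 'spacy[cuda11x]'          # cupy wheel for CUDA 11.x
          # or let cupy-wheel detect the local CUDA toolkit:
          pip install 'spacy[cuda-autodetect]'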
spacy/__init__.py

@@ -1,6 +1,6 @@
-from typing import Union, Iterable, Dict, Any
-from pathlib import Path
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, Union
 
 # set library-specific custom warning handling before doing anything else
 from .errors import setup_default_warnings
@@ -8,20 +8,17 @@ from .errors import setup_default_warnings
 setup_default_warnings()  # noqa: E402
 
 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
-from thinc.api import Config
+from thinc.api import Config, prefer_gpu, require_cpu, require_gpu  # noqa: F401
 
 from . import pipeline  # noqa: F401
-from .cli.info import info  # noqa: F401
-from .glossary import explain  # noqa: F401
-from .about import __version__  # noqa: F401
-from .util import registry, logger  # noqa: F401
-
-from .errors import Errors
-from .language import Language
-from .vocab import Vocab
 from . import util
+from .about import __version__  # noqa: F401
+from .cli.info import info  # noqa: F401
+from .errors import Errors
+from .glossary import explain  # noqa: F401
+from .language import Language
+from .util import logger, registry  # noqa: F401
+from .vocab import Vocab
 
 
 if sys.maxunicode == 65535:
     raise SystemError(Errors.E130)
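Note: the import-block rewrites in this file and in the spacy/cli modules below are mechanical isort output under the black profile added to pyproject.toml above: stdlib imports first, then third-party, then local, each group alphabetized (only spacy/cli/debug_data.py mixes in behavioral changes as well). Re-applying the ordering locally is one step, assuming isort is installed per requirements.txt:

      - name: Re-apply import ordering  # reads [tool.isort] from pyproject.toml
        run: python -m isort spacy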
spacy/about.py

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "4.0.0.dev0"
+__version__ = "4.0.0.dev1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
spacy/attrs.pxd

@@ -1,5 +1,6 @@
 from . cimport symbols
 
+
 cdef enum attr_id_t:
     NULL_ATTR = 0
     IS_ALPHA = symbols.IS_ALPHA
spacy/cli/__init__.py

@@ -1,35 +1,35 @@
 from wasabi import msg
 
 from ._util import app, setup_cli  # noqa: F401
+from .apply import apply  # noqa: F401
+from .assemble import assemble_cli  # noqa: F401
 
 # These are the actual functions, NOT the wrapped CLI commands. The CLI commands
 # are registered automatically and won't have to be imported here.
 from .benchmark_speed import benchmark_speed_cli  # noqa: F401
-from .download import download  # noqa: F401
-from .info import info  # noqa: F401
-from .package import package  # noqa: F401
-from .profile import profile  # noqa: F401
-from .train import train_cli  # noqa: F401
-from .assemble import assemble_cli  # noqa: F401
-from .pretrain import pretrain  # noqa: F401
-from .debug_data import debug_data  # noqa: F401
-from .debug_config import debug_config  # noqa: F401
-from .debug_model import debug_model  # noqa: F401
-from .debug_diff import debug_diff  # noqa: F401
-from .evaluate import evaluate  # noqa: F401
-from .apply import apply  # noqa: F401
 from .convert import convert  # noqa: F401
-from .init_pipeline import init_pipeline_cli  # noqa: F401
-from .init_config import init_config, fill_config  # noqa: F401
-from .validate import validate  # noqa: F401
-from .project.clone import project_clone  # noqa: F401
-from .project.assets import project_assets  # noqa: F401
-from .project.run import project_run  # noqa: F401
-from .project.dvc import project_update_dvc  # noqa: F401
-from .project.push import project_push  # noqa: F401
-from .project.pull import project_pull  # noqa: F401
-from .project.document import project_document  # noqa: F401
+from .debug_config import debug_config  # noqa: F401
+from .debug_data import debug_data  # noqa: F401
+from .debug_diff import debug_diff  # noqa: F401
+from .debug_model import debug_model  # noqa: F401
+from .download import download  # noqa: F401
+from .evaluate import evaluate  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
+from .info import info  # noqa: F401
+from .init_config import fill_config, init_config  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
+from .package import package  # noqa: F401
+from .pretrain import pretrain  # noqa: F401
+from .profile import profile  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.document import project_document  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
+from .project.pull import project_pull  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .train import train_cli  # noqa: F401
+from .validate import validate  # noqa: F401
 
 
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
spacy/cli/_util.py

@@ -1,26 +1,45 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
+import os
+import shutil
+import sys
+from configparser import InterpolationError
+from contextlib import contextmanager
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
+import srsly
 import typer
 from click import NoSuchOption
 from click.parser import split_arg_string
-from typer.main import get_command
-from contextlib import contextmanager
 from thinc.api import Config, ConfigValidationError, require_gpu
 from thinc.util import gpu_is_available
-from configparser import InterpolationError
-import os
+from typer.main import get_command
+from wasabi import Printer, msg
 
-from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from ..errors import RENAMED_LANGUAGE_CODES
 from .. import about
+from ..errors import RENAMED_LANGUAGE_CODES
+from ..schemas import ProjectConfigSchema, validate
+from ..util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    import_file,
+    is_compatible_version,
+    logger,
+    make_tempdir,
+    registry,
+    run_command,
+)
 
 if TYPE_CHECKING:
     from pathy import FluidPath  # noqa: F401
spacy/cli/apply.py

@@ -1,18 +1,15 @@
-import tqdm
-import srsly
-
 from itertools import chain
 from pathlib import Path
-from typing import Optional, List, Iterable, cast, Union
+from typing import Iterable, List, Optional, Union, cast
 
+import srsly
+import tqdm
 from wasabi import msg
 
-from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
-
 from ..tokens import Doc, DocBin
-from ..vocab import Vocab
 from ..util import ensure_path, load_model
+from ..vocab import Vocab
+from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
 
 
 path_help = """Location of the documents to predict on.
 Can be a single file in .spacy format or a .jsonl file.
spacy/cli/assemble.py

@@ -1,13 +1,20 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
 
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code_paths
 from .. import util
 from ..util import get_sourced_components, load_model_from_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code_paths,
+    parse_config_overrides,
+    show_validation_error,
+)
 
 
 @app.command(
spacy/cli/benchmark_speed.py

@@ -1,11 +1,12 @@
-from typing import Iterable, List, Optional
 import random
-from itertools import islice
-import numpy
-from pathlib import Path
 import time
-from tqdm import tqdm
+from itertools import islice
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+import numpy
 import typer
+from tqdm import tqdm
 from wasabi import msg
 
 from .. import util
spacy/cli/convert.py

@@ -1,18 +1,22 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, Union
-from enum import Enum
-from pathlib import Path
-from wasabi import Printer
-import srsly
+import itertools
 import re
 import sys
-import itertools
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Iterable, Mapping, Optional, Union
+
+import srsly
+from wasabi import Printer
 
-from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
-from ..training.converters import conllu_to_docs
+from ..training import docs_to_json
+from ..training.converters import (
+    conll_ner_to_docs,
+    conllu_to_docs,
+    iob_to_docs,
+    json_to_docs,
+)
+from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory
 
 
 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
spacy/cli/debug_config.py

@@ -1,15 +1,22 @@
-from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
-from wasabi import msg, table
+from typing import Any, Dict, List, Optional, Union
+
+import typer
 from thinc.api import Config
 from thinc.config import VARIABLE_RE
-import typer
+from wasabi import msg, table
 
-from ._util import Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code_paths, debug_cli
+from .. import util
 from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    import_code_paths,
+    parse_config_overrides,
+    show_validation_error,
+)
 
 
 @debug_cli.command(
@@ -1,29 +1,49 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import Literal, cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+    cast,
+    overload,
+)
 
-from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code_paths, debug_cli, _format_number
-from ..training import Example, remove_bilu_prefix
-from ..training.initialize import get_sourced_components
-from ..schemas import ConfigSchemaTraining
-from ..pipeline import TrainablePipe
+import numpy
+import srsly
+import typer
+from wasabi import MESSAGES, Printer, msg
+
+from .. import util
+from ..language import Language
+from ..morphology import Morphology
+from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
+from ..pipeline._edit_tree_internals.edit_trees import EditTrees
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer, SpanCategorizer
-from ..pipeline._edit_tree_internals.edit_trees import EditTrees
-from ..morphology import Morphology
-from ..language import Language
+from ..schemas import ConfigSchemaTraining
+from ..training import Example, remove_bilu_prefix
+from ..training.initialize import get_sourced_components
 from ..util import registry, resolve_dot_names
 from ..vectors import Mode as VectorsMode
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    _format_number,
+    app,
+    debug_cli,
+    import_code_paths,
+    parse_config_overrides,
+    show_validation_error,
+)
 
 
 # Minimum number of expected occurrences of NER label in data to train new label
 NEW_LABEL_THRESHOLD = 50
@@ -210,7 +230,7 @@ def debug_data(
     else:
         msg.info("No word vectors present in the package")
 
-    if "spancat" in factory_names:
+    if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
         model_labels_spancat = _get_labels_from_spancat(nlp)
         has_low_data_warning = False
         has_no_neg_warning = False
@@ -335,7 +355,7 @@ def debug_data(
                 show=verbose,
             )
         else:
-            msg.good("Examples without ocurrences available for all labels")
+            msg.good("Examples without occurrences available for all labels")
 
     if "ner" in factory_names:
         # Get all unique NER labels present in the data
@@ -520,9 +540,13 @@ def debug_data(
 
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
-        label_list = [label for label in gold_train_data["tags"]]
-        model_labels = _get_labels_from_model(nlp, "tagger")
+        label_list, counts = zip(*gold_train_data["tags"].items())
         msg.info(f"{len(label_list)} label(s) in train data")
+        p = numpy.array(counts)
+        p = p / p.sum()
+        norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
+        msg.info(f"{norm_entropy} is the normalised label entropy")
+        model_labels = _get_labels_from_model(nlp, "tagger")
         labels = set(label_list)
         missing_labels = model_labels - labels
         if missing_labels:
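The normalised label entropy reported above is the Shannon entropy of the training tag distribution divided by log2 of the number of labels, so it runs from 0.0 (one tag dominates the data) to 1.0 (tags are uniformly distributed). A minimal standalone sketch of the same computation, assuming only numpy and hypothetical tag counts:

    import numpy

    counts = {"NOUN": 50, "VERB": 30, "DET": 20}  # hypothetical tag counts
    p = numpy.array(list(counts.values()), dtype=float)
    p = p / p.sum()
    # Shannon entropy, normalised by the maximum entropy log2(K) for K labels
    norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(p))
    print(norm_entropy)  # ~0.94 for this fairly balanced distribution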
@@ -824,7 +848,7 @@ def _compile_gold(
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-        if "spancat" in factory_names:
+        if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
             for spans_key in list(eg.reference.spans.keys()):
                 # Obtain the span frequency
                 if spans_key not in data["spancat"]:
@@ -1022,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
     pipe_names = [
         pipe_name
         for pipe_name in nlp.pipe_names
-        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+        if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
     ]
     labels: Dict[str, Set[str]] = {}
     for pipe_name in pipe_names:
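`debug data` now treats the single-label `spancat_singlelabel` factory the same as `spancat` throughout. A hedged sketch of the factory check above, run against a hypothetical pipeline:

    import spacy

    nlp = spacy.blank("en")
    nlp.add_pipe("spancat_singlelabel")
    # Mirror _get_labels_from_spancat: match either spancat variant
    spancat_pipes = [
        name
        for name in nlp.pipe_names
        if nlp.get_pipe_meta(name).factory in ("spancat", "spancat_singlelabel")
    ]
    print(spancat_pipes)  # ['spancat_singlelabel']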
@@ -1,13 +1,13 @@
+from pathlib import Path
 from typing import Optional
 
 import typer
-from wasabi import Printer, diff_strings, MarkdownRenderer
-from pathlib import Path
 from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, diff_strings
 
-from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
 from ..util import load_config
-from .init_config import init_config, Optimizations
+from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
+from .init_config import Optimizations, init_config
 
 
 @debug_cli.command(
@@ -1,19 +1,32 @@
-from typing import Dict, Any, Optional
-from pathlib import Path
 import itertools
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import typer
+from thinc.api import (
+    Model,
+    data_validation,
+    fix_random_seed,
+    set_dropout_rate,
+    set_gpu_allocator,
+)
+from wasabi import msg
 
 from spacy.training import Example
 from spacy.util import resolve_dot_names
-from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate
-from thinc.api import Model, data_validation, set_gpu_allocator
-import typer
 
-from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list, setup_gpu
+from .. import util
 from ..schemas import ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+    string_to_list,
+)
 
 
 @debug_cli.command(
@@ -1,14 +1,20 @@
-from typing import Optional, Sequence
-import requests
 import sys
-from wasabi import msg
-import typer
+from typing import Optional, Sequence
+
+import requests
+import typer
+from wasabi import msg
 
-from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version, get_installed_models
-from ..util import get_package_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_package_version,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
 @app.command(
@@ -83,11 +89,8 @@ def download(
 
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
     dl_tpl = "{m}-{v}/{m}-{v}{s}"
-    egg_tpl = "#egg={m}=={v}"
     suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
     filename = dl_tpl.format(m=model_name, v=version, s=suffix)
-    if sdist:
-        filename += egg_tpl.format(m=model_name, v=version)
     return filename
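With the `#egg={m}=={v}` fragment dropped, `get_model_filename` returns a plain relative path for both wheels and sdists. A hedged illustration with a hypothetical model name, assuming the usual values of the suffix constants:

    dl_tpl = "{m}-{v}/{m}-{v}{s}"
    SDIST_SUFFIX = ".tar.gz"            # assumed value of the constant
    WHEEL_SUFFIX = "-py3-none-any.whl"  # assumed value of the constant

    print(dl_tpl.format(m="en_core_web_sm", v="3.7.1", s=WHEEL_SUFFIX))
    # en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl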
@@ -1,16 +1,16 @@
-from typing import Optional, List, Dict, Any, Union
-from wasabi import Printer
-from pathlib import Path
 import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
 import srsly
 from thinc.api import fix_random_seed
+from wasabi import Printer
 
-from ..training import Corpus
-from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code_paths, benchmark_cli
+from .. import displacy, util
 from ..scorer import Scorer
-from .. import util
-from .. import displacy
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu
 
 
 @benchmark_cli.command(
@@ -27,6 +27,7 @@ def evaluate_cli(
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
+    per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
     # fmt: on
 ):
     """
@@ -50,6 +51,7 @@ def evaluate_cli(
         gold_preproc=gold_preproc,
         displacy_path=displacy_path,
         displacy_limit=displacy_limit,
+        per_component=per_component,
         silent=False,
     )
 
@@ -64,6 +66,7 @@ def evaluate(
     displacy_limit: int = 25,
     silent: bool = True,
     spans_key: str = "sc",
+    per_component: bool = False,
 ) -> Dict[str, Any]:
     msg = Printer(no_print=silent, pretty=not silent)
     fix_random_seed()
@@ -78,7 +81,16 @@ def evaluate(
     corpus = Corpus(data_path, gold_preproc=gold_preproc)
     nlp = util.load_model(model)
     dev_dataset = list(corpus(nlp))
-    scores = nlp.evaluate(dev_dataset)
+    scores = nlp.evaluate(dev_dataset, per_component=per_component)
+    if per_component:
+        data = scores
+        if output is None:
+            msg.warn(
+                "The per-component option is enabled but there is no output JSON file provided to save the scores to."
+            )
+        else:
+            msg.info("Per-component scores will be saved to output JSON file.")
+    else:
         metrics = {
             "TOK": "token_acc",
             "TAG": "tag_acc",
@@ -122,6 +134,8 @@ def evaluate(
         docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
         render_deps = "parser" in factory_names
         render_ents = "ner" in factory_names
+        render_spans = "spancat" in factory_names
+
         render_parses(
             docs,
             displacy_path,
@@ -129,6 +143,7 @@ def evaluate(
             limit=displacy_limit,
             deps=render_deps,
             ents=render_ents,
+            spans=render_spans,
         )
         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
 
@@ -182,6 +197,7 @@ def render_parses(
     limit: int = 250,
     deps: bool = True,
     ents: bool = True,
+    spans: bool = True,
 ):
     docs[0].user_data["title"] = model_name
     if ents:
@@ -195,6 +211,11 @@ def render_parses(
         with (output_path / "parses.html").open("w", encoding="utf8") as file_:
             file_.write(html)
 
+    if spans:
+        html = displacy.render(docs[:limit], style="span", page=True)
+        with (output_path / "spans.html").open("w", encoding="utf8") as file_:
+            file_.write(html)
+
 
 def print_prf_per_type(
     msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
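Taken together, these changes let the evaluation command report scores keyed by component and render span annotations alongside dependency and entity parses. A hedged sketch of the per-component path from Python, with hypothetical paths:

    import spacy
    from spacy.training import Corpus

    nlp = spacy.load("./model-best")  # hypothetical trained pipeline
    corpus = Corpus("./corpus/dev.spacy", gold_preproc=False)
    dev_dataset = list(corpus(nlp))
    scores = nlp.evaluate(dev_dataset, per_component=True)
    print(scores.keys())  # e.g. dict_keys(['tagger', 'parser', 'ner'])

On the command line the same breakdown is requested with `--per-component`, which, per the help text above, only takes effect together with `--output`, since the per-component scores are written to the JSON file rather than printed as a table.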
@@ -1,17 +1,17 @@
 import functools
+import logging
 import operator
 from pathlib import Path
-import logging
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
 
 import numpy
 import wasabi.tables
 
-from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
-from ..errors import Errors
-from ..training import Corpus
-from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
+from ..errors import Errors
+from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
+from ..training import Corpus
+from ._util import Arg, Opt, app, import_code, setup_gpu
 
 _DEFAULTS = {
     "n_trials": 11,
@@ -35,7 +35,7 @@ def find_threshold_cli(
     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
-    verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
     """
@@ -1,15 +1,15 @@
-from typing import Optional, Dict, Any, Union, List
-import platform
-import json
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer
-import srsly
 import importlib.metadata
+import json
+import platform
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
 
-from ._util import app, Arg, Opt, string_to_list
-from .download import get_model_filename, get_latest_version
-from .. import util
-from .. import about
+import srsly
+from wasabi import MarkdownRenderer, Printer
+
+from .. import about, util
+from ._util import Arg, Opt, app, string_to_list
+from .download import get_latest_version, get_model_filename
 
 
 @app.command("info")
@@ -1,19 +1,27 @@
-from typing import Optional, List, Tuple
+import re
 from enum import Enum
 from pathlib import Path
-from wasabi import Printer, diff_strings
-from thinc.api import Config
+from typing import List, Optional, Tuple
+
 import srsly
-import re
 from jinja2 import Template
+from thinc.api import Config
+from wasabi import Printer, diff_strings
 
 from .. import util
 from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code, _handle_renamed_language_codes
+from ._util import (
+    COMMAND,
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    show_validation_error,
+    string_to_list,
+)
 
 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
@@ -1,15 +1,24 @@
-from typing import Optional
 import logging
 from pathlib import Path
-from wasabi import msg
-import typer
+from typing import Optional
+
 import srsly
+import typer
+from wasabi import msg
 
 from .. import util
-from ..training.initialize import init_nlp, convert_vectors
 from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu, _handle_renamed_language_codes
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+    Arg,
+    Opt,
+    _handle_renamed_language_codes,
+    import_code,
+    init_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @init_cli.command("vectors")
@@ -1,18 +1,18 @@
-from typing import Optional, Union, Any, Dict, List, Tuple, cast
-import shutil
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer, get_raw_input
-from thinc.api import Config
-from collections import defaultdict
-from catalogue import RegistryError
-import srsly
-import sys
 import re
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
-from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
-from ..schemas import validate, ModelMetaSchema
-from .. import util
-from .. import about
+import srsly
+from catalogue import RegistryError
+from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, get_raw_input
+
+from .. import about, util
+from ..schemas import ModelMetaSchema, validate
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
 
 
 @app.command("package")
@@ -1,13 +1,21 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import re
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg
 
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code_paths, setup_gpu
 from ..training.pretrain import pretrain
 from ..util import load_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code_paths,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @app.command(
@@ -23,6 +31,7 @@ def pretrain_cli(
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
     # fmt: on
 ):
     """
@@ -74,6 +83,7 @@ def pretrain_cli(
         epoch_resume=epoch_resume,
         use_gpu=use_gpu,
         silent=False,
+        skip_last=skip_last,
     )
     msg.good("Successfully finished pretrain")
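With the new flag, an invocation like `python -m spacy pretrain config.cfg ./output --skip-last` keeps only the periodic per-epoch checkpoints and skips writing `model-last.bin`, which can save disk space on long pretraining runs.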
@@ -1,17 +1,18 @@
-from typing import Optional, Sequence, Union, Iterator
-import tqdm
-from pathlib import Path
-import srsly
 import cProfile
+import itertools
 import pstats
 import sys
-import itertools
-from wasabi import msg, Printer
-import typer
+from pathlib import Path
+from typing import Iterator, Optional, Sequence, Union
+
+import srsly
+import tqdm
+import typer
+from wasabi import Printer, msg
 
-from ._util import app, debug_cli, Arg, Opt, NAME
 from ..language import Language
 from ..util import load_model
+from ._util import NAME, Arg, Opt, app, debug_cli
 
 
 @debug_cli.command("profile")
@@ -1,16 +1,27 @@
-from typing import Any, Dict, Optional
-from pathlib import Path
-from wasabi import msg
 import os
 import re
 import shutil
+from pathlib import Path
+from typing import Any, Dict, Optional
+
 import requests
 import typer
+from wasabi import msg
 
 from ...util import ensure_path, working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-from .._util import get_checksum, download_file, git_checkout, get_git_version
-from .._util import SimpleFrozenDict, parse_config_overrides
+from .._util import (
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    SimpleFrozenDict,
+    download_file,
+    get_checksum,
+    get_git_version,
+    git_checkout,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
 
 # Whether assets are extra if `extra` is not set.
 EXTRA_DEFAULT = False
@@ -1,13 +1,22 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import subprocess
 import re
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from wasabi import msg
 
 from ... import about
 from ...util import ensure_path
-from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_checkout, get_git_version, git_repo_branch_exists
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_git_version,
+    git_checkout,
+    git_repo_branch_exists,
+    project_cli,
+)
 
 DEFAULT_REPO = about.__projects__
 DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
@@ -1,9 +1,9 @@
 from pathlib import Path
-from wasabi import msg, MarkdownRenderer
+
+from wasabi import MarkdownRenderer, msg
 
 from ...util import working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-
+from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
 
 DOCS_URL = "https://spacy.io"
 INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
@@ -1,15 +1,28 @@
 """This module contains helpers and subcommands for integrating spaCy projects
 with Data Version Control (DVC). https://dvc.org"""
-from typing import Dict, Any, List, Optional, Iterable
 import subprocess
 from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
 from wasabi import msg
 
-from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
-from .._util import Arg, Opt, NAME, COMMAND
-from ...util import working_dir, split_command, join_command, run_command
-from ...util import SimpleFrozenList
+from ...util import (
+    SimpleFrozenList,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    NAME,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_hash,
+    load_project_config,
+    project_cli,
+)
 
 DVC_CONFIG = "dvc.yaml"
 DVC_DIR = ".dvc"
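For reference, this integration is exposed as `python -m spacy project dvc <project_dir>`, which regenerates the `dvc.yaml` file named by `DVC_CONFIG` above from the project's workflow.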
@@ -1,9 +1,9 @@
 from pathlib import Path
+
 from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_command_hash
-from .._util import project_cli, Arg, logger
-from .._util import load_project_config
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash
 from .run import update_lockfile
@@ -1,9 +1,9 @@
 from pathlib import Path
+
 from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_content_hash, get_command_hash
-from .._util import load_project_config
-from .._util import project_cli, Arg, logger
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
 
 
 @project_cli.command("push")
@@ -1,18 +1,25 @@
-from typing import Optional, List, Dict, TYPE_CHECKING
+import hashlib
 import os
 import site
-import hashlib
-import urllib.parse
 import tarfile
+import urllib.parse
 from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
+
 from wasabi import msg
 
-from .._util import get_hash, get_checksum, upload_file, download_file
-from .._util import ensure_pathy, make_tempdir
-from ...util import get_minor_version, ENV_VARS, check_bool_env_var
-from ...git_info import GIT_VERSION
 from ... import about
 from ...errors import Errors
+from ...git_info import GIT_VERSION
+from ...util import ENV_VARS, check_bool_env_var, get_minor_version
+from .._util import (
+    download_file,
+    ensure_pathy,
+    get_checksum,
+    get_hash,
+    make_tempdir,
+    upload_file,
+)
 
 if TYPE_CHECKING:
     from pathy import FluidPath  # noqa: F401
@@ -1,20 +1,39 @@
-from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
 import os.path
-from pathlib import Path
-
-from wasabi import msg
-from wasabi.util import locale_escape
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
+
 import srsly
 import typer
+from wasabi import msg
+from wasabi.util import locale_escape
 
 from ... import about
 from ...git_info import GIT_VERSION
-from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
-from ...util import check_bool_env_var, SimpleFrozenDict
-from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
-from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
+from ...util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    check_bool_env_var,
+    is_cwd,
+    is_minor_version_match,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    PROJECT_LOCK,
+    Arg,
+    Opt,
+    get_checksum,
+    get_hash,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)
 
 
 @project_cli.command(
@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
 {%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
 [paths]
 train = null
 dev = null
@@ -24,8 +24,11 @@ gpu_allocator = null
 lang = "{{ lang }}"
 {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
 {%- set with_accuracy = optimize == "accuracy" -%}
-{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
+{# The BOW textcat doesn't need a source of features, so it can omit the
+tok2vec/transformer. #}
+{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
+{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
 {%- else -%}
 {%- set full_pipeline = components -%}
@@ -122,6 +125,30 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 {% endif -%}
 
+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
 {% if "spancat" in components -%}
 [components.spancat]
 factory = "spancat"
@@ -154,6 +181,36 @@ grad_factor = 1.0
 sizes = [1,2,3]
 {% endif -%}
 
+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.spancat_singlelabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"
@@ -219,10 +276,16 @@ no_output_layer = false
 
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = true
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
@@ -250,10 +313,16 @@ no_output_layer = false
 
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatCNN.v2"
 exclusive_classes = false
-ngram_size = 1
-no_output_layer = false
+nO = null
+
+[components.textcat_multilabel.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.textcat_multilabel.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
 {%- endif %}
 {%- endif %}
@@ -284,6 +353,7 @@ maxout_pieces = 3
 {% if "morphologizer" in components %}
 [components.morphologizer]
 factory = "morphologizer"
+label_smoothing = 0.05
 
 [components.morphologizer.model]
 @architectures = "spacy.Tagger.v2"
@@ -297,6 +367,7 @@ width = ${components.tok2vec.model.encode.width}
 {% if "tagger" in components %}
 [components.tagger]
 factory = "tagger"
+label_smoothing = 0.05
 
 [components.tagger.model]
 @architectures = "spacy.Tagger.v2"
@@ -341,6 +412,27 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 {% endif %}
 
+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = null
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
 {% if "spancat" in components %}
 [components.spancat]
 factory = "spancat"
@@ -370,6 +462,33 @@ width = ${components.tok2vec.model.encode.width}
 sizes = [1,2,3]
 {% endif %}
 
+{% if "spancat_singlelabel" in components %}
+[components.spancat_singlelabel]
+factory = "spancat_singlelabel"
+negative_weight = 1.0
+allow_overlap = true
+scorer = {"@scorers":"spacy.spancat_scorer.v1"}
+spans_key = "sc"
+
+[components.spancat_singlelabel.model]
+@architectures = "spacy.SpanCategorizer.v1"
+
+[components.spancat_singlelabel.model.reducer]
+@layers = "spacy.mean_max_reducer.v1"
+hidden_size = 128
+
+[components.spancat_singlelabel.model.scorer]
+@layers = "Softmax.v2"
+
+[components.spancat_singlelabel.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.spancat_singlelabel.suggester]
+@misc = "spacy.ngram_suggester.v1"
+sizes = [1,2,3]
+{% endif %}
+
 {% if "trainable_lemmatizer" in components -%}
 [components.trainable_lemmatizer]
 factory = "trainable_lemmatizer"
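The quickstart template now covers `span_finder` and `spancat_singlelabel`, so a config generated with, for example, `python -m spacy init config config.cfg --lang en --pipeline spancat_singlelabel --optimize accuracy` includes the new blocks wired to the shared `tok2vec` or transformer listener. Note also that in the transformer branch the non-accuracy textcat now uses `spacy.TextCatCNN.v2` listening to the shared transformer, while the bag-of-words architecture remains for the case where, as the new template comment says, no feature source is needed.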
@@ -1,15 +1,23 @@
-from typing import Optional, Dict, Any, Union
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
 import sys
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import typer
+from wasabi import msg
 
-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code_paths, setup_gpu
-from ..training.loop import train as train_nlp
-from ..training.initialize import init_nlp
 from .. import util
+from ..training.initialize import init_nlp
+from ..training.loop import train as train_nlp
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code_paths,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)
 
 
 @app.command(
@@ -1,14 +1,21 @@
-from typing import Tuple
-from pathlib import Path
 import sys
-import requests
-from wasabi import msg, Printer
 import warnings
+from pathlib import Path
+from typing import Tuple
+
+import requests
+from wasabi import Printer, msg
 
-from ._util import app
 from .. import about
-from ..util import get_package_version, get_installed_models, get_minor_version
-from ..util import get_package_path, get_model_meta, is_compatible_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_model_meta,
+    get_package_path,
+    get_package_version,
+    is_compatible_version,
+)
+from ._util import app
 
 
 @app.command("validate")
@@ -1,5 +1,6 @@
 """Helpers for Python and platform compatibility."""
 import sys
+
 from thinc.util import copy_array
 
 try:
@@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
-from typing import Union, Iterable, Optional, Dict, Any, Callable
 import warnings
+from typing import Any, Callable, Dict, Iterable, Optional, Union
 
-from .render import DependencyRenderer, EntityRenderer, SpanRenderer
-from ..tokens import Doc, Span
 from ..errors import Errors, Warnings
-from ..util import is_in_jupyter
-from ..util import find_available_port
-
+from ..tokens import Doc, Span
+from ..util import find_available_port, is_in_jupyter
+from .render import DependencyRenderer, EntityRenderer, SpanRenderer
 
 _html = {}
 RENDER_WRAPPER = None
@@ -68,7 +66,7 @@ def render(
     if jupyter or (jupyter is None and is_in_jupyter()):
         # return HTML rendered by IPython display()
         # See #4840 for details on span wrapper to disable mathjax
-        from IPython.core.display import display, HTML
+        from IPython.core.display import HTML, display
 
         return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
     return html
@@ -125,13 +123,17 @@ def app(environ, start_response):
     return [res]
 
 
-def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
+def parse_deps(
+    orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
+) -> Dict[str, Any]:
     """Generate dependency parse in {'words': [], 'arcs': []} format.
 
-    orig_doc (Doc): Document to parse.
+    orig_doc (Union[Doc, Span]): Document to parse.
     options (Dict[str, Any]): Dependency parse specific visualisation options.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
+    if isinstance(orig_doc, Span):
+        orig_doc = orig_doc.as_doc()
     doc = Doc(orig_doc.vocab).from_bytes(
         orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
     )
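Since `parse_deps` now accepts a `Span` and converts it internally with `Span.as_doc()`, a single sentence can be visualised without manually slicing out a new `Doc`. A hedged sketch, assuming the `en_core_web_sm` pipeline is installed:

    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("This is a sentence. This is another sentence.")
    sent = next(doc.sents)  # a Span covering the first sentence
    html = displacy.render(sent, style="dep", jupyter=False)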
@@ -1,15 +1,29 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
-import uuid
 import itertools
+import uuid
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from ..errors import Errors
 from ..util import escape_html, minify_html, registry
-from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
-from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
-from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
-from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
-from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
-from .templates import TPL_TITLE
+from .templates import (
+    TPL_DEP_ARCS,
+    TPL_DEP_SVG,
+    TPL_DEP_WORDS,
+    TPL_DEP_WORDS_LEMMA,
+    TPL_ENT,
+    TPL_ENT_RTL,
+    TPL_ENTS,
+    TPL_FIGURE,
+    TPL_KB_LINK,
+    TPL_PAGE,
+    TPL_SPAN,
+    TPL_SPAN_RTL,
+    TPL_SPAN_SLICE,
+    TPL_SPAN_SLICE_RTL,
+    TPL_SPAN_START,
+    TPL_SPAN_START_RTL,
+    TPL_SPANS,
+    TPL_TITLE,
+)
 
 DEFAULT_LANG = "en"
 DEFAULT_DIR = "ltr"
@@ -1,5 +1,5 @@
-from typing import Literal
 import warnings
+from typing import Literal


 class ErrorsWithCodes(type):
@@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
             "ignoring the duplicate entry.")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
-    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
+    W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
             "the Knowledge Base.")
     W026 = ("Unable to set all sentence boundaries from dependency parses. If "
             "you are constructing a parse tree incrementally by setting "
@@ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes):
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")

+    # v4 warning strings
     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+    W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
+            "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
+            "to return `True` in `.supports_prior_probs`.")


 class Errors(metaclass=ErrorsWithCodes):
@@ -542,6 +546,8 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")

     # New errors added in v3.x
+    E850 = ("The PretrainVectors objective currently only supports default or "
+            "floret vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
             "but found value of '{val}'.")
     E852 = ("The tar file pulled from the remote attempted an unsafe path "
@@ -922,7 +928,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1029 = ("Edit tree cannot be applied to form.")
     E1030 = ("Edit tree identifier out of range.")
     E1031 = ("Could not find gold transition - see logs above.")
-    E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
+    E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
     E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
     E1034 = ("Node index {i} out of bounds ({length})")
     E1035 = ("Token index {i} out of bounds ({length})")
@@ -951,6 +957,14 @@ class Errors(metaclass=ErrorsWithCodes):
             "with `displacy.serve(doc, port=port)`")
     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
             "or use `auto_select_port=True` to pick an available port automatically.")
+    E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
+    E1052 = ("Unable to copy spans: the character offsets for the span at "
+             "index {i} in the span group do not align with the tokenization "
+             "in the target doc.")
+    E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
+             " 'min_length': {min_length}, 'max_length': {max_length}")
+    E1054 = ("The text, including whitespace, must match between reference and "
+             "predicted docs when training {component}.")

     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
@@ -961,6 +975,12 @@
             "reference and predicted docs.")
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
+    E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
+             "{existing_value}.")
+    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
+    E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.")
+


 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

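For context on how these tables are consumed: the ErrorsWithCodes metaclass prepends the code to each message, and the {placeholder} fields are filled with str.format() at the raise site. A hedged sketch using the new E1051 template (the exact "[E1051] ..." prefix shape is an assumption based on how spaCy error codes surface in tracebacks):

    from spacy.errors import Errors

    # Placeholders are filled where the error is raised:
    msg = Errors.E1051.format(max_positive=3)
    print(msg)
    # Roughly: [E1051] 'allow_overlap' can only be False when max_positive is 1,
    # but found 'max_positive': 3.
    # A component would then do: raise ValueError(msg)
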
@@ -1,4 +1,5 @@
 import warnings
+
 from .errors import Warnings


@@ -1,3 +1,5 @@
+from .candidate import Candidate, InMemoryCandidate
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
-from .candidate import Candidate, get_candidates, get_candidates_batch
+
+__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]

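With this __init__.py change, the module-level get_candidates/get_candidates_batch helpers are no longer exported; candidate retrieval goes through the KB instance instead, and InMemoryCandidate becomes importable directly. A short sketch of the new import surface:

    from spacy.kb import InMemoryCandidate, InMemoryLookupKB

    # Before: from spacy.kb import get_candidates; get_candidates(kb, mention)
    # Now the KB instance is queried directly:
    # candidates = kb.get_candidates(mention)
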
@@ -1,12 +1,17 @@
-from .kb cimport KnowledgeBase
 from libcpp.vector cimport vector
-from ..typedefs cimport hash_t

-# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
+from ..typedefs cimport hash_t
+from .kb_in_memory cimport InMemoryLookupKB
+
+
 cdef class Candidate:
-    cdef readonly KnowledgeBase kb
-    cdef hash_t entity_hash
-    cdef float entity_freq
-    cdef vector[float] entity_vector
-    cdef hash_t alias_hash
-    cdef float prior_prob
+    pass
+
+
+cdef class InMemoryCandidate(Candidate):
+    cdef readonly hash_t _entity_hash
+    cdef readonly hash_t _alias_hash
+    cdef vector[float] _entity_vector
+    cdef float _prior_prob
+    cdef readonly InMemoryLookupKB _kb
+    cdef float _entity_freq

@@ -1,74 +1,98 @@
 # cython: infer_types=True, profile=True

-from typing import Iterable
-from .kb cimport KnowledgeBase
-from ..tokens import Span
+from .kb_in_memory cimport InMemoryLookupKB
+
+from ..errors import Errors


 cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
-    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
+    """A `Candidate` object refers to a textual mention that may or may not be resolved
+    to a specific entity from a Knowledge Base. This will be used as input for the entity linking
     algorithm which will disambiguate the various candidates to the correct one.
-    Each candidate (alias, entity) pair is assigned a certain prior probability.
+    Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
+    is assigned a certain prior probability.

     DOCS: https://spacy.io/api/kb/#candidate-init
     """

-    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
-        self.kb = kb
-        self.entity_hash = entity_hash
-        self.entity_freq = entity_freq
-        self.entity_vector = entity_vector
-        self.alias_hash = alias_hash
-        self.prior_prob = prior_prob
+    def __init__(self):
+        # Make sure abstract Candidate is not instantiated.
+        if self.__class__ == Candidate:
+            raise TypeError(
+                Errors.E1046.format(cls_name=self.__class__.__name__)
+            )

     @property
-    def entity(self) -> int:
-        """RETURNS (uint64): hash of the entity's KB ID/name"""
-        return self.entity_hash
+    def entity_id(self) -> int:
+        """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
+        otherwise the hash of the entity ID string)."""
+        raise NotImplementedError

     @property
-    def entity_(self) -> str:
-        """RETURNS (str): ID/name of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity_hash]
+    def entity_id_(self) -> str:
+        """RETURNS (str): String representation of entity ID."""
+        raise NotImplementedError

     @property
-    def alias(self) -> int:
-        """RETURNS (uint64): hash of the alias"""
-        return self.alias_hash
+    def entity_vector(self) -> vector[float]:
+        """RETURNS (vector[float]): Entity vector."""
+        raise NotImplementedError
+
+
+cdef class InMemoryCandidate(Candidate):
+    """Candidate for InMemoryLookupKB."""
+
+    def __init__(
+        self,
+        kb: InMemoryLookupKB,
+        entity_hash: int,
+        alias_hash: int,
+        entity_vector: vector[float],
+        prior_prob: float,
+        entity_freq: float
+    ):
+        """
+        kb (InMemoryLookupKB): InMemoryLookupKB instance.
+        entity_hash (int): Entity ID as hash that can be looked up with InMemoryLookupKB.vocab.strings.__getitem__().
+        entity_freq (int): Entity frequency in KB corpus.
+        entity_vector (List[float]): Entity embedding.
+        alias_hash (int): Alias hash.
+        prior_prob (float): Prior probability of entity for this alias, i.e. the probability that, independent of
+            the context, this alias - which matches one of this entity's aliases - resolves to this entity.
+        """
+        super().__init__()
+
+        self._entity_hash = entity_hash
+        self._entity_vector = entity_vector
+        self._prior_prob = prior_prob
+        self._kb = kb
+        self._alias_hash = alias_hash
+        self._entity_freq = entity_freq

     @property
-    def alias_(self) -> str:
-        """RETURNS (str): ID of the original alias"""
-        return self.kb.vocab.strings[self.alias_hash]
+    def entity_id(self) -> int:
+        return self._entity_hash

     @property
-    def entity_freq(self) -> float:
-        return self.entity_freq
-
-    @property
-    def entity_vector(self) -> Iterable[float]:
-        return self.entity_vector
+    def entity_vector(self) -> vector[float]:
+        return self._entity_vector

     @property
     def prior_prob(self) -> float:
-        return self.prior_prob
+        """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
+        this entity."""
+        return self._prior_prob

+    @property
+    def alias(self) -> str:
+        """RETURNS (str): Alias."""
+        return self._kb.vocab.strings[self._alias_hash]

-def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
-    """
-    Return candidate entities for a given mention and fetching appropriate entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Span): Entity mention for which to identify candidates.
-    RETURNS (Iterable[Candidate]): Identified candidates.
-    """
-    return kb.get_candidates(mention)
+    @property
+    def entity_id_(self) -> str:
+        return self._kb.vocab.strings[self._entity_hash]

-
-def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
-    """
-    Return candidate entities for the given mentions and fetching appropriate entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Iterable[Span]): Entity mentions for which to identify candidates.
-    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
-    """
-    return kb.get_candidates_batch(mentions)
+    @property
+    def entity_freq(self) -> float:
+        """RETURNS (float): Entity frequency in KB corpus."""
+        return self._entity_freq

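Because Candidate is now an abstract base class (instantiating it directly raises a TypeError via E1046), custom knowledge bases subclass it and implement the entity_id, entity_id_ and entity_vector properties. A minimal sketch of such a subclass; everything except the Candidate import is an illustrative assumption:

    from spacy.kb import Candidate

    class StaticCandidate(Candidate):
        """Toy candidate backed by plain Python data (illustrative only)."""

        def __init__(self, entity_id: str, vector: list):
            super().__init__()
            self._entity_id = entity_id
            self._vector = vector

        @property
        def entity_id(self) -> int:
            # Numerical form of the ID; here simply a hash of the string ID.
            return hash(self._entity_id)

        @property
        def entity_id_(self) -> str:
            return self._entity_id

        @property
        def entity_vector(self):
            return self._vector
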
@@ -2,8 +2,10 @@

 from cymem.cymem cimport Pool
 from libc.stdint cimport int64_t
+
 from ..vocab cimport Vocab

+
 cdef class KnowledgeBase:
     cdef Pool mem
     cdef readonly Vocab vocab

@@ -2,12 +2,13 @@

 from pathlib import Path
 from typing import Iterable, Tuple, Union
+
 from cymem.cymem cimport Pool

-from .candidate import Candidate
-from ..tokens import Span
-from ..util import SimpleFrozenList
 from ..errors import Errors
+from ..tokens import Span, SpanGroup
+from ..util import SimpleFrozenList
+from .candidate import Candidate


 cdef class KnowledgeBase:
@@ -30,21 +31,23 @@ cdef class KnowledgeBase:
         self.entity_vector_length = entity_vector_length
         self.mem = Pool()

-    def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+    def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
         """
-        Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If no candidate is found for a given text, an empty list is returned.
-        mentions (Iterable[Span]): Mentions for which to get candidates.
+        Return candidate entities for the given mentions. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the mention text resolving to that entity - might be included.
+        If no candidates are found for a given mention, an empty list is returned.
+        mentions (SpanGroup): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
         """
         return [self.get_candidates(span) for span in mentions]

     def get_candidates(self, mention: Span) -> Iterable[Candidate]:
         """
-        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If the no candidate is found for a given text, an empty list is returned.
+        Return candidate entities for a specific mention. Each candidate defines at least the entity and the
+        entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
+        probability of the specified mention text resolving to that entity - might be included.
+        If no candidate is found for the given mention, an empty list is returned.
         mention (Span): Mention for which to get candidates.
         RETURNS (Iterable[Candidate]): Identified candidates.
         """
@@ -106,3 +109,10 @@ cdef class KnowledgeBase:
         raise NotImplementedError(
             Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
         )
+
+    @property
+    def supports_prior_probs(self) -> bool:
+        """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
+        raise NotImplementedError(
+            Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
+        )

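A consequence of the two changes above: a custom KB subclass only has to implement get_candidates(), since get_candidates_batch() falls back to a per-span loop over the SpanGroup, and it can opt out of prior-probability lookups via supports_prior_probs. A rough sketch under those assumptions (CustomDictKB and its lookup table are invented names, not part of spaCy):

    from spacy.kb import KnowledgeBase
    from spacy.tokens import Span

    class CustomDictKB(KnowledgeBase):
        """Illustrative KB that looks candidates up in a plain dict."""

        def __init__(self, vocab, entity_vector_length, table):
            super().__init__(vocab, entity_vector_length)
            self._table = table  # mention text -> list of Candidate objects

        def get_candidates(self, mention: Span):
            # get_candidates_batch() inherits the default per-span loop,
            # so overriding this single method is enough.
            return self._table.get(mention.text, [])

        @property
        def supports_prior_probs(self) -> bool:
            # No prior-probability lookups here; with incl_prior=True the
            # entity linker would emit W401 and ignore the setting.
            return False
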
@@ -1,11 +1,11 @@
 """Knowledge-base for entity or concept linking."""
-from preshed.maps cimport PreshMap
-from libcpp.vector cimport vector
 from libc.stdint cimport int32_t, int64_t
 from libc.stdio cimport FILE
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap

+from ..structs cimport AliasC, KBEntryC
 from ..typedefs cimport hash_t
-from ..structs cimport KBEntryC, AliasC
 from .kb cimport KnowledgeBase

 ctypedef vector[KBEntryC] entry_vec

@@ -1,24 +1,29 @@
 # cython: infer_types=True, profile=True
-from typing import Iterable, Callable, Dict, Any, Union
+from typing import Any, Callable, Dict, Iterable, Union

 import srsly
-from preshed.maps cimport PreshMap
-from cpython.exc cimport PyErr_SetFromErrno
-from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
-from libc.stdint cimport int32_t, int64_t
-from libcpp.vector cimport vector

-from pathlib import Path
+from cpython.exc cimport PyErr_SetFromErrno
+from libc.stdint cimport int32_t, int64_t
+from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
+from libcpp.vector cimport vector
+from preshed.maps cimport PreshMap
+
 import warnings
+from pathlib import Path

 from ..tokens import Span
+
 from ..typedefs cimport hash_t
-from ..errors import Errors, Warnings
+
 from .. import util
+from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList, ensure_path
+
 from ..vocab cimport Vocab
 from .kb cimport KnowledgeBase
-from .candidate import Candidate as Candidate
+
+from .candidate import InMemoryCandidate


 cdef class InMemoryLookupKB(KnowledgeBase):
@@ -226,10 +231,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             alias_entry.probs = probs
             self._aliases_table[alias_index] = alias_entry

-    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
-        return self.get_alias_candidates(mention.text)  # type: ignore
+    def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
+        return self._get_alias_candidates(mention.text)  # type: ignore

-    def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
+    def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
         """
         Return candidate entities for an alias. Each candidate defines the entity, the original alias,
         and the prior probability of that alias resolving to that entity.
@@ -241,14 +246,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]

-        return [Candidate(kb=self,
-                          entity_hash=self._entries[entry_index].entity_hash,
-                          entity_freq=self._entries[entry_index].freq,
-                          entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
-                          alias_hash=alias_hash,
-                          prior_prob=prior_prob)
-                for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
-                if entry_index != 0]
+        return [
+            InMemoryCandidate(
+                kb=self,
+                entity_hash=self._entries[entry_index].entity_hash,
+                alias_hash=alias_hash,
+                entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
+                prior_prob=prior_prob,
+                entity_freq=self._entries[entry_index].freq
+            )
+            for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+            if entry_index != 0
+        ]

     def get_vector(self, str entity):
         cdef hash_t entity_hash = self.vocab.strings[entity]
@@ -279,6 +288,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):

         return 0.0

+    @property
+    def supports_prior_probs(self) -> bool:
+        return True
+
     def to_bytes(self, **kwargs):
         """Serialize the current state to a binary string.
         """

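Taken together, the in-memory KB behaves the same at the call site; what changed is the returned type (InMemoryCandidate) and the property names (entity_id_/alias rather than entity_/alias_), with get_alias_candidates becoming the private _get_alias_candidates. A hedged end-to-end sketch:

    import spacy
    from spacy.kb import InMemoryLookupKB

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q1004791", freq=12, entity_vector=[1.0, 0.0, 0.0])
    kb.add_alias(alias="Douglas", entities=["Q1004791"], probabilities=[0.8])

    doc = nlp("Douglas was mentioned here.")
    for candidate in kb.get_candidates(doc[0:1]):  # InMemoryCandidate objects
        print(candidate.entity_id_, candidate.alias, candidate.prior_prob)
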
@@ -1,5 +1,5 @@
+from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults


 class AfrikaansDefaults(BaseDefaults):

@@ -1,12 +1,11 @@
-from .stop_words import STOP_WORDS
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
-
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc


 class AmharicDefaults(BaseDefaults):

@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()


@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH

 _exc = {}


@@ -1,8 +1,8 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults


 class ArabicDefaults(BaseDefaults):

@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _suffixes = (
     LIST_PUNCT

@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}


@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS


 class AzerbaijaniDefaults(BaseDefaults):

@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 # Eleven, twelve etc. are written separate: on bir, on iki

 _num_words = [

@@ -1,12 +1,14 @@
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..punctuation import (
+    COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+    COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc


 class BulgarianDefaults(BaseDefaults):

@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 _num_words = [
     "нула",
     "едно",

@@ -4,8 +4,7 @@ References:
     (countries, occupations, fields of studies and more).
 """

-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH

 _exc = {}


@@ -1,10 +1,12 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults
+
+from ...language import BaseDefaults, Language
 from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class BengaliDefaults(BaseDefaults):

@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
-
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _currency = r"\$¢£€¥฿৳"
 _quotes = CONCAT_QUOTES.replace("'", "")

@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

@@ -1,14 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
 from .lemmatizer import CatalanLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class CatalanDefaults(BaseDefaults):

@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 _num_words = [
     "zero",
     "un",

@@ -1,9 +1,18 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import LIST_CURRENCY
-from ..char_classes import CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-from ..char_classes import merge_chars, _units
-
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    _units,
+    merge_chars,
+)

 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")


@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import NOUN, PROPN
+from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:

@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}


@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS


 class CzechDefaults(BaseDefaults):

@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class DanishDefaults(BaseDefaults):

@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 # Source http://fjern-uv.dk/tal.php
 _num_words = """nul
 en et to tre fire fem seks syv otte ni ti

@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)
 from ..punctuation import TOKENIZER_SUFFIXES

-
 _quotes = CONCAT_QUOTES.replace("'", "")

 _infixes = (

@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:

@@ -2,10 +2,9 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class GermanDefaults(BaseDefaults):

@@ -1,9 +1,18 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    UNITS,
+)
 from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES

-
 _prefixes = ["``"] + BASE_TOKENIZER_PREFIXES

 _suffixes = (

@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span


@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {
     "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],

@@ -1,6 +1,6 @@
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults


 class LowerSorbianDefaults(BaseDefaults):

@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
 from .lemmatizer import GreekLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class GreekDefaults(BaseDefaults):

@@ -1,5 +1,6 @@
 def get_pos_from_wiktionary():
     import re
+
     from gensim.corpora.wikicorpus import extract_pages

     regex = re.compile(r"==={{(\w+)\|el}}===")

@@ -1,6 +1,16 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES, CURRENCY
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    HYPHENS,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+)

 _units = (
     "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "

@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span


@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}


@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
 from .lemmatizer import EnglishLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class EnglishDefaults(BaseDefaults):

@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)

 _infixes = (
     LIST_ELLIPSES

@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span


@@ -1,8 +1,8 @@
 from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc

+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc: Dict[str, List[Dict]] = {}
 _exclude = [

@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+
+from ...language import BaseDefaults, Language
 from .lemmatizer import SpanishLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class SpanishDefaults(BaseDefaults):

Some files were not shown because too many files have changed in this diff.