Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)

Merge branch 'v4' into feature/multiple-code-files
Commit 28c8a577fc

.github/azure-steps.yml (vendored) | 129
@@ -1,129 +0,0 @@
| parameters: | ||||
|   python_version: '' | ||||
|   architecture: 'x64' | ||||
|   num_build_jobs: 2 | ||||
| 
 | ||||
| steps: | ||||
|   - task: UsePythonVersion@0 | ||||
|     inputs: | ||||
|       versionSpec: ${{ parameters.python_version }} | ||||
|       architecture: ${{ parameters.architecture }} | ||||
|       allowUnstable: true | ||||
| 
 | ||||
|   - bash: | | ||||
|       echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}" | ||||
|     displayName: 'Set variables' | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m pip install -U build pip setuptools | ||||
|       python -m pip install -U -r requirements.txt | ||||
|     displayName: "Install dependencies" | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m build --sdist | ||||
|     displayName: "Build sdist" | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m mypy spacy | ||||
|     displayName: 'Run mypy' | ||||
|     condition: ne(variables['python_version'], '3.6') | ||||
| 
 | ||||
|   - task: DeleteFiles@1 | ||||
|     inputs: | ||||
|       contents: "spacy" | ||||
|     displayName: "Delete source directory" | ||||
| 
 | ||||
|   - task: DeleteFiles@1 | ||||
|     inputs: | ||||
|       contents: "*.egg-info" | ||||
|     displayName: "Delete egg-info directory" | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m pip freeze > installed.txt | ||||
|       python -m pip uninstall -y -r installed.txt | ||||
|     displayName: "Uninstall all packages" | ||||
| 
 | ||||
|   - bash: | | ||||
|       SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) | ||||
|       SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST | ||||
|     displayName: "Install from sdist" | ||||
| 
 | ||||
|   - script: | | ||||
|       python -W error -c "import spacy" | ||||
|     displayName: "Test import" | ||||
| 
 | ||||
| #  - script: | | ||||
| #      python -m spacy download ca_core_news_sm | ||||
| #      python -m spacy download ca_core_news_md | ||||
| #      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" | ||||
| #    displayName: 'Test download CLI' | ||||
| #    condition: eq(variables['python_version'], '3.8') | ||||
| # | ||||
| #  - script: | | ||||
| #      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" | ||||
| #    displayName: 'Test no warnings on load (#11713)' | ||||
| #    condition: eq(variables['python_version'], '3.8') | ||||
| # | ||||
| #  - script: | | ||||
| #      python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping | ||||
| #    displayName: 'Test skip re-download (#12188)' | ||||
| #    condition: eq(variables['python_version'], '3.8') | ||||
| 
 | ||||
| #  - script: | | ||||
| #      python -W error -m spacy info ca_core_news_sm | grep -q download_url | ||||
| #    displayName: 'Test download_url in info CLI' | ||||
| #    condition: eq(variables['python_version'], '3.8') | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . | ||||
|     displayName: 'Test convert CLI' | ||||
|     condition: eq(variables['python_version'], '3.8') | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m spacy init config -p ner -l ca ner.cfg | ||||
|       python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | ||||
|     displayName: 'Test debug config CLI' | ||||
|     condition: eq(variables['python_version'], '3.8') | ||||
| 
 | ||||
|   - script: | | ||||
|       # will have errors due to sparse data, check for summary in output | ||||
|       python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary | ||||
|     displayName: 'Test debug data CLI' | ||||
|     condition: eq(variables['python_version'], '3.8') | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 | ||||
|     displayName: 'Test train CLI' | ||||
|     condition: eq(variables['python_version'], '3.8') | ||||
| 
 | ||||
| #  - script: | | ||||
| #      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" | ||||
| #      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir | ||||
| #    displayName: 'Test assemble CLI' | ||||
| #    condition: eq(variables['python_version'], '3.8') | ||||
| # | ||||
| #  - script: | | ||||
| #      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" | ||||
| #      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 | ||||
| #    displayName: 'Test assemble CLI vectors warning' | ||||
| #    condition: eq(variables['python_version'], '3.8') | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m pip install -U -r requirements.txt | ||||
|     displayName: "Install test requirements" | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m pytest --pyargs spacy -W error | ||||
|     displayName: "Run CPU tests" | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m pip install 'spacy[apple]' | ||||
|       python -m pytest --pyargs spacy | ||||
|     displayName: "Run CPU tests with thinc-apple-ops" | ||||
|     condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) | ||||
| 
 | ||||
|   - script: | | ||||
|       python .github/validate_universe_json.py website/meta/universe.json | ||||
|     displayName: 'Test website/meta/universe.json' | ||||
|     condition: eq(variables['python_version'], '3.8') | ||||
| 
 | ||||
.github/workflows/autoblack.yml (vendored) | 45
@@ -1,45 +0,0 @@
| # GitHub Action that uses Black to reformat all Python code and submits a PR | ||||
| # in regular intervals. Inspired by: https://github.com/cclauss/autoblack | ||||
| 
 | ||||
| name: autoblack | ||||
| on: | ||||
|   workflow_dispatch:  # allow manual trigger | ||||
|   schedule: | ||||
|     - cron: '0 8 * * 5'  # every Friday at 8am UTC | ||||
| 
 | ||||
| jobs: | ||||
|   autoblack: | ||||
|     if: github.repository_owner == 'explosion' | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - uses: actions/checkout@v3 | ||||
|         with: | ||||
|             ref: ${{ github.head_ref }} | ||||
|       - uses: actions/setup-python@v4 | ||||
|       - run: pip install black -c requirements.txt | ||||
|       - name: Auto-format code if needed | ||||
|         run: black spacy | ||||
|       # We can't run black --check here because that returns a non-zero exit | ||||
|       # code and makes GitHub think the action failed | ||||
|       - name: Check for modified files | ||||
|         id: git-check | ||||
|         run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT | ||||
| 
 | ||||
|       - name: Create Pull Request | ||||
|         if: steps.git-check.outputs.modified == 'true' | ||||
|         uses: peter-evans/create-pull-request@v4 | ||||
|         with: | ||||
|             title: Auto-format code with black | ||||
|             labels: meta | ||||
|             commit-message: Auto-format code with black | ||||
|             committer: GitHub <noreply@github.com> | ||||
|             author: explosion-bot <explosion-bot@users.noreply.github.com> | ||||
|             body: _This PR is auto-generated._ | ||||
|             branch: autoblack | ||||
|             delete-branch: true | ||||
|             draft: false | ||||
|       - name: Check outputs | ||||
|         if: steps.git-check.outputs.modified == 'true' | ||||
|         run: | | ||||
|           echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" | ||||
|           echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" | ||||
.github/workflows/explosionbot.yml (vendored) | 1
@@ -8,6 +8,7 @@ on:
| 
 | ||||
| jobs: | ||||
|   explosion-bot: | ||||
|     if: github.repository_owner == 'explosion' | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - name: Dump GitHub context | ||||
.github/workflows/issue-manager.yml (vendored) | 1
@@ -13,6 +13,7 @@ on:
| 
 | ||||
| jobs: | ||||
|   issue-manager: | ||||
|     if: github.repository_owner == 'explosion' | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - uses: tiangolo/issue-manager@0.4.0 | ||||
.github/workflows/lock.yml (vendored) | 1
@@ -13,6 +13,7 @@ concurrency:
| 
 | ||||
| jobs: | ||||
|   action: | ||||
|     if: github.repository_owner == 'explosion' | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - uses: dessant/lock-threads@v4 | ||||
.github/workflows/spacy_universe_alert.yml (vendored) | 1
@@ -7,6 +7,7 @@ on:
| 
 | ||||
| jobs: | ||||
|   build: | ||||
|     if: github.repository_owner == 'explosion' | ||||
|     runs-on: ubuntu-latest | ||||
| 
 | ||||
|     steps: | ||||
.github/workflows/tests.yml (vendored, new file) | 173
@@ -0,0 +1,173 @@
| name: tests | ||||
| 
 | ||||
| on: | ||||
|   push: | ||||
|     branches-ignore: | ||||
|       - "spacy.io" | ||||
|       - "nightly.spacy.io" | ||||
|       - "v2.spacy.io" | ||||
|     paths-ignore: | ||||
|       - "*.md" | ||||
|       - "*.mdx" | ||||
|       - "website/**" | ||||
|       - ".github/workflows/**" | ||||
|   pull_request: | ||||
|     types: [opened, synchronize, reopened, edited] | ||||
|     paths-ignore: | ||||
|       - "*.md" | ||||
|       - "*.mdx" | ||||
|       - "website/**" | ||||
| 
 | ||||
| jobs: | ||||
|   validate: | ||||
|     name: Validate | ||||
|     if: github.repository_owner == 'explosion' | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - name: Check out repo | ||||
|         uses: actions/checkout@v3 | ||||
| 
 | ||||
|       - name: Configure Python version | ||||
|         uses: actions/setup-python@v4 | ||||
|         with: | ||||
|           python-version: "3.8" | ||||
|           architecture: x64 | ||||
| 
 | ||||
|       - name: black | ||||
|         run: | | ||||
|           python -m pip install black -c requirements.txt | ||||
|           python -m black spacy --check | ||||
|       - name: isort | ||||
|         run: | | ||||
|           python -m pip install isort -c requirements.txt | ||||
|           python -m isort spacy --check | ||||
|       - name: flake8 | ||||
|         run: | | ||||
|           python -m pip install flake8==5.0.4 | ||||
|           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics | ||||
|   tests: | ||||
|     name: Test | ||||
|     needs: Validate | ||||
|     strategy: | ||||
|       fail-fast: true | ||||
|       matrix: | ||||
|         os: [ubuntu-latest, windows-latest, macos-latest] | ||||
|         python_version: ["3.11"] | ||||
|         include: | ||||
|           - os: macos-latest | ||||
|             python_version: "3.8" | ||||
|           - os: ubuntu-20.04 | ||||
|             python_version: "3.9" | ||||
|           - os: windows-latest | ||||
|             python_version: "3.10" | ||||
| 
 | ||||
|     runs-on: ${{ matrix.os }} | ||||
| 
 | ||||
|     steps: | ||||
|       - name: Check out repo | ||||
|         uses: actions/checkout@v3 | ||||
| 
 | ||||
|       - name: Configure Python version | ||||
|         uses: actions/setup-python@v4 | ||||
|         with: | ||||
|           python-version: ${{ matrix.python_version }} | ||||
|           architecture: x64 | ||||
| 
 | ||||
|       - name: Install dependencies | ||||
|         run: | | ||||
|           python -m pip install -U build pip setuptools | ||||
|           python -m pip install -U -r requirements.txt | ||||
| 
 | ||||
|       - name: Build sdist | ||||
|         run: | | ||||
|           python -m build --sdist | ||||
| 
 | ||||
|       - name: Run mypy | ||||
|         run: | | ||||
|           python -m mypy spacy | ||||
| 
 | ||||
|       - name: Delete source directory and .egg-info | ||||
|         run: | | ||||
|           rm -rf spacy *.egg-info | ||||
|         shell: bash | ||||
| 
 | ||||
|       - name: Uninstall all packages | ||||
|         run: | | ||||
|           python -m pip freeze | ||||
|           python -m pip freeze --exclude pywin32 > installed.txt | ||||
|           python -m pip uninstall -y -r installed.txt | ||||
| 
 | ||||
|       - name: Install from sdist | ||||
|         run: | | ||||
|           SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) | ||||
|           SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST | ||||
|         shell: bash | ||||
| 
 | ||||
|       - name: Test import | ||||
|         run: python -W error -c "import spacy" | ||||
| 
 | ||||
|       #      - name: "Test download CLI" | ||||
|       #        run: | | ||||
|       #          python -m spacy download ca_core_news_sm | ||||
|       #          python -m spacy download ca_core_news_md | ||||
|       #          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" | ||||
|       #        if: matrix.python_version == '3.9' | ||||
|       # | ||||
|       #      - name: "Test download_url in info CLI" | ||||
|       #        run: | | ||||
|       #          python -W error -m spacy info ca_core_news_sm | grep -q download_url | ||||
|       #        if: matrix.python_version == '3.9' | ||||
|       # | ||||
|       #      - name: "Test no warnings on load (#11713)" | ||||
|       #        run: | | ||||
|       #          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" | ||||
|       #        if: matrix.python_version == '3.9' | ||||
| 
 | ||||
|       - name: "Test convert CLI" | ||||
|         run: | | ||||
|           python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . | ||||
|         if: matrix.python_version == '3.9' | ||||
| 
 | ||||
|       - name: "Test debug config CLI" | ||||
|         run: | | ||||
|           python -m spacy init config -p ner -l ca ner.cfg | ||||
|           python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | ||||
|         if: matrix.python_version == '3.9' | ||||
| 
 | ||||
|       - name: "Test debug data CLI" | ||||
|         run: | | ||||
|           # will have errors due to sparse data, check for summary in output | ||||
|           python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary | ||||
|         if: matrix.python_version == '3.9' | ||||
| 
 | ||||
|       - name: "Test train CLI" | ||||
|         run: | | ||||
|           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 | ||||
|         if: matrix.python_version == '3.9' | ||||
| 
 | ||||
|       #      - name: "Test assemble CLI" | ||||
|       #        run: | | ||||
|       #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" | ||||
|       #          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir | ||||
|       #        if: matrix.python_version == '3.9' | ||||
|       # | ||||
|       #      - name: "Test assemble CLI vectors warning" | ||||
|       #        run: | | ||||
|       #          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" | ||||
|       #          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 | ||||
|       #        if: matrix.python_version == '3.9' | ||||
| 
 | ||||
|       - name: "Install test requirements" | ||||
|         run: | | ||||
|           python -m pip install -U -r requirements.txt | ||||
| 
 | ||||
|       - name: "Run CPU tests" | ||||
|         run: | | ||||
|           python -m pytest --pyargs spacy -W error | ||||
|         if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')" | ||||
| 
 | ||||
|       - name: "Run CPU tests with thinc-apple-ops" | ||||
|         run: | | ||||
|           python -m pip install 'spacy[apple]' | ||||
|           python -m pytest --pyargs spacy | ||||
|         if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11' | ||||
.github/workflows/universe_validation.yml (vendored, new file) | 33
@@ -0,0 +1,33 @@
| name: universe validation | ||||
| 
 | ||||
| on: | ||||
|   push: | ||||
|     branches-ignore: | ||||
|       - "spacy.io" | ||||
|       - "nightly.spacy.io" | ||||
|       - "v2.spacy.io" | ||||
|     paths: | ||||
|       - "website/meta/universe.json" | ||||
|   pull_request: | ||||
|     types: [opened, synchronize, reopened, edited] | ||||
|     paths: | ||||
|       - "website/meta/universe.json" | ||||
| 
 | ||||
| jobs: | ||||
|   validate: | ||||
|     name: Validate | ||||
|     if: github.repository_owner == 'explosion' | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - name: Check out repo | ||||
|         uses: actions/checkout@v3 | ||||
| 
 | ||||
|       - name: Configure Python version | ||||
|         uses: actions/setup-python@v4 | ||||
|         with: | ||||
|           python-version: "3.8" | ||||
|           architecture: x64 | ||||
| 
 | ||||
|       - name: Validate website/meta/universe.json | ||||
|         run: | | ||||
|           python .github/validate_universe_json.py website/meta/universe.json | ||||
README.md
@@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
| model packaging, deployment and workflow management. spaCy is commercial | ||||
| open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). | ||||
| 
 | ||||
| 💥 **We'd love to hear more about your experience with spaCy!** | ||||
| [Fill out our survey here.](https://form.typeform.com/to/aMel9q9f) | ||||
| 
 | ||||
| 💫 **Version 3.5 out now!** | ||||
| [Check out the release notes here.](https://github.com/explosion/spaCy/releases) | ||||
| 
 | ||||
@@ -33,7 +36,7 @@ open-source software, released under the [MIT license](https://github.com/explos
| ## 📖 Documentation | ||||
| 
 | ||||
| | Documentation                 |                                                                        | | ||||
| | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | ----------------------------- | ---------------------------------------------------------------------- | | ||||
| | ⭐️ **[spaCy 101]**           | New to spaCy? Here's everything you need to know!                      | | ||||
| | 📚 **[Usage Guides]**         | How to use spaCy and its features.                                     | | ||||
| | 🚀 **[New in v3.0]**          | New features, backwards incompatibilities and migration guide.         | | ||||
@@ -41,6 +44,7 @@ open-source software, released under the [MIT license](https://github.com/explos
| | 🎛 **[API Reference]**         | The detailed reference for spaCy's API.                                | | ||||
| | 📦 **[Models]**               | Download trained pipelines for spaCy.                                  | | ||||
| | 🌌 **[Universe]**             | Plugins, extensions, demos and books from the spaCy ecosystem.         | | ||||
| | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. | | ||||
| | 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. | | ||||
| | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | | ||||
| | 🛠 **[Changelog]** | Changes and version history. | | ||||
@@ -54,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
| [api reference]: https://spacy.io/api/ | ||||
| [models]: https://spacy.io/models | ||||
| [universe]: https://spacy.io/universe | ||||
| [spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode | ||||
| [videos]: https://www.youtube.com/c/ExplosionAI | ||||
| [online course]: https://course.spacy.io | ||||
| [project templates]: https://github.com/explosion/projects | ||||
| [changelog]: https://spacy.io/usage#changelog | ||||
| [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md | ||||
| 
 | ||||
| 
 | ||||
| ## 💬 Where to ask questions | ||||
| 
 | ||||
| The spaCy project is maintained by the [spaCy team](https://explosion.ai/about). | ||||
azure-pipelines.yml
@@ -1,99 +0,0 @@
| trigger: | ||||
|   batch: true | ||||
|   branches: | ||||
|     include: | ||||
|       - "*" | ||||
|     exclude: | ||||
|       - "spacy.io" | ||||
|       - "nightly.spacy.io" | ||||
|       - "v2.spacy.io" | ||||
|   paths: | ||||
|     exclude: | ||||
|       - "website/*" | ||||
|       - "*.md" | ||||
|       - "*.mdx" | ||||
|       - ".github/workflows/*" | ||||
| pr: | ||||
|   paths: | ||||
|     exclude: | ||||
|       - "*.md" | ||||
|       - "*.mdx" | ||||
|       - "website/docs/*" | ||||
|       - "website/src/*" | ||||
|       - "website/meta/*.tsx" | ||||
|       - "website/meta/*.mjs" | ||||
|       - "website/meta/languages.json" | ||||
|       - "website/meta/site.json" | ||||
|       - "website/meta/sidebars.json" | ||||
|       - "website/meta/type-annotations.json" | ||||
|       - "website/pages/*" | ||||
|       - ".github/workflows/*" | ||||
| 
 | ||||
| jobs: | ||||
|   # Check formatting and linting. Perform basic checks for most important errors | ||||
|   # (syntax etc.) Uses the config defined in setup.cfg and overwrites the | ||||
|   # selected codes. | ||||
|   - job: "Validate" | ||||
|     pool: | ||||
|       vmImage: "ubuntu-latest" | ||||
|     steps: | ||||
|       - task: UsePythonVersion@0 | ||||
|         inputs: | ||||
|           versionSpec: "3.8" | ||||
|       - script: | | ||||
|           pip install black -c requirements.txt | ||||
|           python -m black spacy --check | ||||
|         displayName: "black" | ||||
|       - script: | | ||||
|           pip install flake8==5.0.4 | ||||
|           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics | ||||
|         displayName: "flake8" | ||||
| 
 | ||||
|   - job: "Test" | ||||
|     dependsOn: "Validate" | ||||
|     strategy: | ||||
|       matrix: | ||||
|         # We're only running one platform per Python version to speed up builds | ||||
|         #        Python38Linux: | ||||
|         #          imageName: "ubuntu-latest" | ||||
|         #          python.version: "3.8" | ||||
|         #        Python38Windows: | ||||
|         #          imageName: "windows-latest" | ||||
|         #          python.version: "3.8" | ||||
|         Python38Mac: | ||||
|           imageName: "macos-latest" | ||||
|           python.version: "3.8" | ||||
|         Python39Linux: | ||||
|           imageName: "ubuntu-latest" | ||||
|           python.version: "3.9" | ||||
|         #        Python39Windows: | ||||
|         #          imageName: "windows-latest" | ||||
|         #          python.version: "3.9" | ||||
|         #        Python39Mac: | ||||
|         #          imageName: "macos-latest" | ||||
|         #          python.version: "3.9" | ||||
|         #        Python310Linux: | ||||
|         #          imageName: "ubuntu-latest" | ||||
|         #          python.version: "3.10" | ||||
|         Python310Windows: | ||||
|           imageName: "windows-latest" | ||||
|           python.version: "3.10" | ||||
|         #        Python310Mac: | ||||
|         #          imageName: "macos-latest" | ||||
|         #          python.version: "3.10" | ||||
|         Python311Linux: | ||||
|           imageName: 'ubuntu-latest' | ||||
|           python.version: '3.11' | ||||
|         Python311Windows: | ||||
|           imageName: 'windows-latest' | ||||
|           python.version: '3.11' | ||||
|         Python311Mac: | ||||
|           imageName: 'macos-latest' | ||||
|           python.version: '3.11' | ||||
|       maxParallel: 4 | ||||
|     pool: | ||||
|       vmImage: $(imageName) | ||||
|     steps: | ||||
|       - template: .github/azure-steps.yml | ||||
|         parameters: | ||||
|           python_version: '$(python.version)' | ||||
build-constraints.txt
@@ -1,6 +1,4 @@
| # build version constraints for use with wheelwright + multibuild | ||||
| numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64' | ||||
| numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64' | ||||
| numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64' | ||||
| numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' | ||||
| numpy==1.19.3; python_version=='3.9' | ||||
pyproject.toml
@@ -9,3 +9,6 @@ requires = 
|     "numpy>=1.15.0", | ||||
| ] | ||||
| build-backend = "setuptools.build_meta" | ||||
| 
 | ||||
| [tool.isort] | ||||
| profile = "black" | ||||
requirements.txt
@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
| wasabi>=0.9.1,<1.2.0 | ||||
| srsly>=2.4.3,<3.0.0 | ||||
| catalogue>=2.0.6,<2.1.0 | ||||
| typer>=0.3.0,<0.8.0 | ||||
| typer>=0.3.0,<0.10.0 | ||||
| pathy>=0.10.0 | ||||
| smart-open>=5.2.1,<7.0.0 | ||||
| # Third party dependencies | ||||
@@ -30,10 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
| mock>=2.0.0,<3.0.0 | ||||
| flake8>=3.8.0,<6.0.0 | ||||
| hypothesis>=3.27.0,<7.0.0 | ||||
| mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7" | ||||
| types-dataclasses>=0.1.3; python_version < "3.7" | ||||
| mypy>=0.990,<1.1.0; platform_machine != "aarch64" | ||||
| types-mock>=0.1.1 | ||||
| types-setuptools>=57.0.0 | ||||
| types-requests | ||||
| types-setuptools>=57.0.0 | ||||
| black==22.3.0 | ||||
| isort>=5.0,<6.0 | ||||
setup.cfg | 46
@@ -30,6 +30,14 @@ project_urls =
| zip_safe = false | ||||
| include_package_data = true | ||||
| python_requires = >=3.8 | ||||
| setup_requires = | ||||
|     cython>=0.25,<3.0 | ||||
|     numpy>=1.15.0 | ||||
|     # We also need our Cython packages here to compile against | ||||
|     cymem>=2.0.2,<2.1.0 | ||||
|     preshed>=3.0.2,<3.1.0 | ||||
|     murmurhash>=0.28.0,<1.1.0 | ||||
|     thinc>=9.0.0.dev2,<9.1.0 | ||||
| install_requires = | ||||
|     # Our libraries | ||||
|     spacy-legacy>=4.0.0.dev0,<4.1.0 | ||||
@@ -42,7 +50,7 @@ install_requires =
|     srsly>=2.4.3,<3.0.0 | ||||
|     catalogue>=2.0.6,<2.1.0 | ||||
|     # Third-party dependencies | ||||
|     typer>=0.3.0,<0.8.0 | ||||
|     typer>=0.3.0,<0.10.0 | ||||
|     pathy>=0.10.0 | ||||
|     smart-open>=5.2.1,<7.0.0 | ||||
|     tqdm>=4.38.0,<5.0.0 | ||||
@@ -67,41 +75,41 @@ transformers =
| ray = | ||||
|     spacy_ray>=0.1.0,<1.0.0 | ||||
| cuda = | ||||
|     cupy>=5.0.0b4,<12.0.0 | ||||
|     cupy>=5.0.0b4,<13.0.0 | ||||
| cuda80 = | ||||
|     cupy-cuda80>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda80>=5.0.0b4,<13.0.0 | ||||
| cuda90 = | ||||
|     cupy-cuda90>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda90>=5.0.0b4,<13.0.0 | ||||
| cuda91 = | ||||
|     cupy-cuda91>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda91>=5.0.0b4,<13.0.0 | ||||
| cuda92 = | ||||
|     cupy-cuda92>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda92>=5.0.0b4,<13.0.0 | ||||
| cuda100 = | ||||
|     cupy-cuda100>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda100>=5.0.0b4,<13.0.0 | ||||
| cuda101 = | ||||
|     cupy-cuda101>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda101>=5.0.0b4,<13.0.0 | ||||
| cuda102 = | ||||
|     cupy-cuda102>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda102>=5.0.0b4,<13.0.0 | ||||
| cuda110 = | ||||
|     cupy-cuda110>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda110>=5.0.0b4,<13.0.0 | ||||
| cuda111 = | ||||
|     cupy-cuda111>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda111>=5.0.0b4,<13.0.0 | ||||
| cuda112 = | ||||
|     cupy-cuda112>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda112>=5.0.0b4,<13.0.0 | ||||
| cuda113 = | ||||
|     cupy-cuda113>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda113>=5.0.0b4,<13.0.0 | ||||
| cuda114 = | ||||
|     cupy-cuda114>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda114>=5.0.0b4,<13.0.0 | ||||
| cuda115 = | ||||
|     cupy-cuda115>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda115>=5.0.0b4,<13.0.0 | ||||
| cuda116 = | ||||
|     cupy-cuda116>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda116>=5.0.0b4,<13.0.0 | ||||
| cuda117 = | ||||
|     cupy-cuda117>=5.0.0b4,<12.0.0 | ||||
|     cupy-cuda117>=5.0.0b4,<13.0.0 | ||||
| cuda11x = | ||||
|     cupy-cuda11x>=11.0.0,<12.0.0 | ||||
|     cupy-cuda11x>=11.0.0,<13.0.0 | ||||
| cuda-autodetect = | ||||
|     cupy-wheel>=11.0.0,<12.0.0 | ||||
|     cupy-wheel>=11.0.0,<13.0.0 | ||||
| apple = | ||||
|     thinc-apple-ops>=0.1.0.dev0,<1.0.0 | ||||
| # Language tokenizers with external dependencies | ||||
spacy/__init__.py
@@ -1,6 +1,6 @@
| from typing import Union, Iterable, Dict, Any | ||||
| from pathlib import Path | ||||
| import sys | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Iterable, Union | ||||
| 
 | ||||
| # set library-specific custom warning handling before doing anything else | ||||
| from .errors import setup_default_warnings | ||||
@@ -8,20 +8,17 @@ from .errors import setup_default_warnings
| setup_default_warnings()  # noqa: E402 | ||||
| 
 | ||||
| # These are imported as part of the API | ||||
| from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401 | ||||
| from thinc.api import Config | ||||
| from thinc.api import Config, prefer_gpu, require_cpu, require_gpu  # noqa: F401 | ||||
| 
 | ||||
| from . import pipeline  # noqa: F401 | ||||
| from .cli.info import info  # noqa: F401 | ||||
| from .glossary import explain  # noqa: F401 | ||||
| from .about import __version__  # noqa: F401 | ||||
| from .util import registry, logger  # noqa: F401 | ||||
| 
 | ||||
| from .errors import Errors | ||||
| from .language import Language | ||||
| from .vocab import Vocab | ||||
| from . import util | ||||
| 
 | ||||
| from .about import __version__  # noqa: F401 | ||||
| from .cli.info import info  # noqa: F401 | ||||
| from .errors import Errors | ||||
| from .glossary import explain  # noqa: F401 | ||||
| from .language import Language | ||||
| from .util import logger, registry  # noqa: F401 | ||||
| from .vocab import Vocab | ||||
| 
 | ||||
| if sys.maxunicode == 65535: | ||||
|     raise SystemError(Errors.E130) | ||||
spacy/about.py
@@ -1,6 +1,6 @@
| # fmt: off | ||||
| __title__ = "spacy" | ||||
| __version__ = "4.0.0.dev0" | ||||
| __version__ = "4.0.0.dev1" | ||||
| __download_url__ = "https://github.com/explosion/spacy-models/releases/download" | ||||
| __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" | ||||
| __projects__ = "https://github.com/explosion/projects" | ||||
spacy/attrs.pxd
@@ -1,5 +1,6 @@
| from . cimport symbols | ||||
| 
 | ||||
| 
 | ||||
| cdef enum attr_id_t: | ||||
|     NULL_ATTR = 0 | ||||
|     IS_ALPHA = symbols.IS_ALPHA | ||||
spacy/cli/__init__.py
@@ -1,35 +1,35 @@
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, setup_cli  # noqa: F401 | ||||
| from .apply import apply  # noqa: F401 | ||||
| from .assemble import assemble_cli  # noqa: F401 | ||||
| 
 | ||||
| # These are the actual functions, NOT the wrapped CLI commands. The CLI commands | ||||
| # are registered automatically and won't have to be imported here. | ||||
| from .benchmark_speed import benchmark_speed_cli  # noqa: F401 | ||||
| from .download import download  # noqa: F401 | ||||
| from .info import info  # noqa: F401 | ||||
| from .package import package  # noqa: F401 | ||||
| from .profile import profile  # noqa: F401 | ||||
| from .train import train_cli  # noqa: F401 | ||||
| from .assemble import assemble_cli  # noqa: F401 | ||||
| from .pretrain import pretrain  # noqa: F401 | ||||
| from .debug_data import debug_data  # noqa: F401 | ||||
| from .debug_config import debug_config  # noqa: F401 | ||||
| from .debug_model import debug_model  # noqa: F401 | ||||
| from .debug_diff import debug_diff  # noqa: F401 | ||||
| from .evaluate import evaluate  # noqa: F401 | ||||
| from .apply import apply  # noqa: F401 | ||||
| from .convert import convert  # noqa: F401 | ||||
| from .init_pipeline import init_pipeline_cli  # noqa: F401 | ||||
| from .init_config import init_config, fill_config  # noqa: F401 | ||||
| from .validate import validate  # noqa: F401 | ||||
| from .project.clone import project_clone  # noqa: F401 | ||||
| from .project.assets import project_assets  # noqa: F401 | ||||
| from .project.run import project_run  # noqa: F401 | ||||
| from .project.dvc import project_update_dvc  # noqa: F401 | ||||
| from .project.push import project_push  # noqa: F401 | ||||
| from .project.pull import project_pull  # noqa: F401 | ||||
| from .project.document import project_document  # noqa: F401 | ||||
| from .debug_config import debug_config  # noqa: F401 | ||||
| from .debug_data import debug_data  # noqa: F401 | ||||
| from .debug_diff import debug_diff  # noqa: F401 | ||||
| from .debug_model import debug_model  # noqa: F401 | ||||
| from .download import download  # noqa: F401 | ||||
| from .evaluate import evaluate  # noqa: F401 | ||||
| from .find_threshold import find_threshold  # noqa: F401 | ||||
| from .info import info  # noqa: F401 | ||||
| from .init_config import fill_config, init_config  # noqa: F401 | ||||
| from .init_pipeline import init_pipeline_cli  # noqa: F401 | ||||
| from .package import package  # noqa: F401 | ||||
| from .pretrain import pretrain  # noqa: F401 | ||||
| from .profile import profile  # noqa: F401 | ||||
| from .project.assets import project_assets  # noqa: F401 | ||||
| from .project.clone import project_clone  # noqa: F401 | ||||
| from .project.document import project_document  # noqa: F401 | ||||
| from .project.dvc import project_update_dvc  # noqa: F401 | ||||
| from .project.pull import project_pull  # noqa: F401 | ||||
| from .project.push import project_push  # noqa: F401 | ||||
| from .project.run import project_run  # noqa: F401 | ||||
| from .train import train_cli  # noqa: F401 | ||||
| from .validate import validate  # noqa: F401 | ||||
| 
 | ||||
| 
 | ||||
| @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) | ||||
spacy/cli/_util.py
@@ -1,26 +1,45 @@
| from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal | ||||
| from typing import TYPE_CHECKING, overload | ||||
| import sys | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from wasabi import msg, Printer | ||||
| import srsly | ||||
| import hashlib | ||||
| import os | ||||
| import shutil | ||||
| import sys | ||||
| from configparser import InterpolationError | ||||
| from contextlib import contextmanager | ||||
| from pathlib import Path | ||||
| from typing import ( | ||||
|     TYPE_CHECKING, | ||||
|     Any, | ||||
|     Dict, | ||||
|     Iterable, | ||||
|     List, | ||||
|     Literal, | ||||
|     Optional, | ||||
|     Tuple, | ||||
|     Union, | ||||
|     overload, | ||||
| ) | ||||
| 
 | ||||
| import srsly | ||||
| import typer | ||||
| from click import NoSuchOption | ||||
| from click.parser import split_arg_string | ||||
| from typer.main import get_command | ||||
| from contextlib import contextmanager | ||||
| from thinc.api import Config, ConfigValidationError, require_gpu | ||||
| from thinc.util import gpu_is_available | ||||
| from configparser import InterpolationError | ||||
| import os | ||||
| from typer.main import get_command | ||||
| from wasabi import Printer, msg | ||||
| 
 | ||||
| from ..schemas import ProjectConfigSchema, validate | ||||
| from ..util import import_file, run_command, make_tempdir, registry, logger | ||||
| from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS | ||||
| from ..errors import RENAMED_LANGUAGE_CODES | ||||
| from .. import about | ||||
| from ..errors import RENAMED_LANGUAGE_CODES | ||||
| from ..schemas import ProjectConfigSchema, validate | ||||
| from ..util import ( | ||||
|     ENV_VARS, | ||||
|     SimpleFrozenDict, | ||||
|     import_file, | ||||
|     is_compatible_version, | ||||
|     logger, | ||||
|     make_tempdir, | ||||
|     registry, | ||||
|     run_command, | ||||
| ) | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from pathy import FluidPath  # noqa: F401 | ||||
spacy/cli/apply.py
@@ -1,18 +1,15 @@
| import tqdm | ||||
| import srsly | ||||
| 
 | ||||
| from itertools import chain | ||||
| from pathlib import Path | ||||
| from typing import Optional, List, Iterable, cast, Union | ||||
| from typing import Iterable, List, Optional, Union, cast | ||||
| 
 | ||||
| import srsly | ||||
| import tqdm | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory | ||||
| 
 | ||||
| from ..tokens import Doc, DocBin | ||||
| from ..vocab import Vocab | ||||
| from ..util import ensure_path, load_model | ||||
| 
 | ||||
| from ..vocab import Vocab | ||||
| from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory | ||||
| 
 | ||||
| path_help = """Location of the documents to predict on. | ||||
| Can be a single file in .spacy format or a .jsonl file. | ||||
spacy/cli/assemble.py
@@ -1,13 +1,20 @@
| from typing import Optional | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import typer | ||||
| import logging | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error | ||||
| from ._util import import_code_paths | ||||
| from .. import util | ||||
| from ..util import get_sourced_components, load_model_from_config | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     app, | ||||
|     import_code_paths, | ||||
|     parse_config_overrides, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @app.command( | ||||
spacy/cli/benchmark_speed.py
@@ -1,11 +1,12 @@
| from typing import Iterable, List, Optional | ||||
| import random | ||||
| from itertools import islice | ||||
| import numpy | ||||
| from pathlib import Path | ||||
| import time | ||||
| from tqdm import tqdm | ||||
| from itertools import islice | ||||
| from pathlib import Path | ||||
| from typing import Iterable, List, Optional | ||||
| 
 | ||||
| import numpy | ||||
| import typer | ||||
| from tqdm import tqdm | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .. import util | ||||
spacy/cli/convert.py
@@ -1,18 +1,22 @@
| from typing import Callable, Iterable, Mapping, Optional, Any, Union | ||||
| from enum import Enum | ||||
| from pathlib import Path | ||||
| from wasabi import Printer | ||||
| import srsly | ||||
| import itertools | ||||
| import re | ||||
| import sys | ||||
| import itertools | ||||
| from enum import Enum | ||||
| from pathlib import Path | ||||
| from typing import Any, Callable, Iterable, Mapping, Optional, Union | ||||
| 
 | ||||
| import srsly | ||||
| from wasabi import Printer | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory | ||||
| from ..training import docs_to_json | ||||
| from ..tokens import Doc, DocBin | ||||
| from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs | ||||
| from ..training.converters import conllu_to_docs | ||||
| 
 | ||||
| from ..training import docs_to_json | ||||
| from ..training.converters import ( | ||||
|     conll_ner_to_docs, | ||||
|     conllu_to_docs, | ||||
|     iob_to_docs, | ||||
|     json_to_docs, | ||||
| ) | ||||
| from ._util import Arg, Opt, _handle_renamed_language_codes, app, walk_directory | ||||
| 
 | ||||
| # Converters are matched by file extension except for ner/iob, which are | ||||
| # matched by file extension and content. To add a converter, add a new | ||||
spacy/cli/debug_config.py
@@ -1,15 +1,22 @@
| from typing import Optional, Dict, Any, Union, List | ||||
| from pathlib import Path | ||||
| from wasabi import msg, table | ||||
| from typing import Any, Dict, List, Optional, Union | ||||
| 
 | ||||
| import typer | ||||
| from thinc.api import Config | ||||
| from thinc.config import VARIABLE_RE | ||||
| import typer | ||||
| from wasabi import msg, table | ||||
| 
 | ||||
| from ._util import Arg, Opt, show_validation_error, parse_config_overrides | ||||
| from ._util import import_code_paths, debug_cli | ||||
| from .. import util | ||||
| from ..schemas import ConfigSchemaInit, ConfigSchemaTraining | ||||
| from ..util import registry | ||||
| from .. import util | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     debug_cli, | ||||
|     import_code_paths, | ||||
|     parse_config_overrides, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @debug_cli.command( | ||||
spacy/cli/debug_data.py
@@ -1,29 +1,49 @@
| from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union | ||||
| from typing import Literal, cast, overload | ||||
| from pathlib import Path | ||||
| from collections import Counter | ||||
| import sys | ||||
| import srsly | ||||
| from wasabi import Printer, MESSAGES, msg | ||||
| import typer | ||||
| import math | ||||
| import sys | ||||
| from collections import Counter | ||||
| from pathlib import Path | ||||
| from typing import ( | ||||
|     Any, | ||||
|     Dict, | ||||
|     Iterable, | ||||
|     List, | ||||
|     Literal, | ||||
|     Optional, | ||||
|     Sequence, | ||||
|     Set, | ||||
|     Tuple, | ||||
|     Union, | ||||
|     cast, | ||||
|     overload, | ||||
| ) | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides | ||||
| from ._util import import_code_paths, debug_cli, _format_number | ||||
| from ..training import Example, remove_bilu_prefix | ||||
| from ..training.initialize import get_sourced_components | ||||
| from ..schemas import ConfigSchemaTraining | ||||
| from ..pipeline import TrainablePipe | ||||
| import numpy | ||||
| import srsly | ||||
| import typer | ||||
| from wasabi import MESSAGES, Printer, msg | ||||
| 
 | ||||
| from .. import util | ||||
| from ..language import Language | ||||
| from ..morphology import Morphology | ||||
| from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe | ||||
| from ..pipeline._edit_tree_internals.edit_trees import EditTrees | ||||
| from ..pipeline._parser_internals import nonproj | ||||
| from ..pipeline._parser_internals.nonproj import DELIMITER | ||||
| from ..pipeline import Morphologizer, SpanCategorizer | ||||
| from ..pipeline._edit_tree_internals.edit_trees import EditTrees | ||||
| from ..morphology import Morphology | ||||
| from ..language import Language | ||||
| from ..schemas import ConfigSchemaTraining | ||||
| from ..training import Example, remove_bilu_prefix | ||||
| from ..training.initialize import get_sourced_components | ||||
| from ..util import registry, resolve_dot_names | ||||
| from ..vectors import Mode as VectorsMode | ||||
| from .. import util | ||||
| 
 | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     _format_number, | ||||
|     app, | ||||
|     debug_cli, | ||||
|     import_code_paths, | ||||
|     parse_config_overrides, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| # Minimum number of expected occurrences of NER label in data to train new label | ||||
| NEW_LABEL_THRESHOLD = 50 | ||||
@@ -210,7 +230,7 @@ def debug_data(
|     else: | ||||
|         msg.info("No word vectors present in the package") | ||||
| 
 | ||||
|     if "spancat" in factory_names: | ||||
|     if "spancat" in factory_names or "spancat_singlelabel" in factory_names: | ||||
|         model_labels_spancat = _get_labels_from_spancat(nlp) | ||||
|         has_low_data_warning = False | ||||
|         has_no_neg_warning = False | ||||
@@ -335,7 +355,7 @@ def debug_data(
|                 show=verbose, | ||||
|             ) | ||||
|         else: | ||||
|             msg.good("Examples without ocurrences available for all labels") | ||||
|             msg.good("Examples without occurrences available for all labels") | ||||
| 
 | ||||
|     if "ner" in factory_names: | ||||
|         # Get all unique NER labels present in the data | ||||
@@ -520,9 +540,13 @@
| 
 | ||||
|     if "tagger" in factory_names: | ||||
|         msg.divider("Part-of-speech Tagging") | ||||
|         label_list = [label for label in gold_train_data["tags"]] | ||||
|         model_labels = _get_labels_from_model(nlp, "tagger") | ||||
|         label_list, counts = zip(*gold_train_data["tags"].items()) | ||||
|         msg.info(f"{len(label_list)} label(s) in train data") | ||||
|         p = numpy.array(counts) | ||||
|         p = p / p.sum() | ||||
|         norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list)) | ||||
|         msg.info(f"{norm_entropy} is the normalised label entropy") | ||||
|         model_labels = _get_labels_from_model(nlp, "tagger") | ||||
|         labels = set(label_list) | ||||
|         missing_labels = model_labels - labels | ||||
|         if missing_labels: | ||||
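The tagger block above now reports a normalised label entropy: the Shannon entropy (in bits) of the tag distribution, divided by log2 of the number of labels so the value lands in [0, 1], where 1.0 means perfectly balanced tags and values near 0 mean a heavily skewed tag set. A standalone sketch of the same computation, using a made-up tag count dict in place of `gold_train_data["tags"]`:

```python
import numpy

# Hypothetical tag counts, standing in for gold_train_data["tags"]
tag_counts = {"NOUN": 120, "VERB": 80, "ADJ": 40, "PUNCT": 60}
label_list, counts = zip(*tag_counts.items())

p = numpy.array(counts, dtype=float)
p = p / p.sum()  # relative frequency of each tag
# Entropy in bits, divided by the maximum possible entropy (uniform tags).
norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
print(f"{norm_entropy:.3f} is the normalised label entropy")
```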
@@ -824,7 +848,7 @@ def _compile_gold(
|                     data["boundary_cross_ents"] += 1 | ||||
|                 elif label == "-": | ||||
|                     data["ner"]["-"] += 1 | ||||
|         if "spancat" in factory_names: | ||||
|         if "spancat" in factory_names or "spancat_singlelabel" in factory_names: | ||||
|             for spans_key in list(eg.reference.spans.keys()): | ||||
|                 # Obtain the span frequency | ||||
|                 if spans_key not in data["spancat"]: | ||||
@@ -1022,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
|     pipe_names = [ | ||||
|         pipe_name | ||||
|         for pipe_name in nlp.pipe_names | ||||
|         if nlp.get_pipe_meta(pipe_name).factory == "spancat" | ||||
|         if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel") | ||||
|     ] | ||||
|     labels: Dict[str, Set[str]] = {} | ||||
|     for pipe_name in pipe_names: | ||||
spacy/cli/debug_diff.py
@@ -1,13 +1,13 @@
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| import typer | ||||
| from wasabi import Printer, diff_strings, MarkdownRenderer | ||||
| from pathlib import Path | ||||
| from thinc.api import Config | ||||
| from wasabi import MarkdownRenderer, Printer, diff_strings | ||||
| 
 | ||||
| from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides | ||||
| from ..util import load_config | ||||
| from .init_config import init_config, Optimizations | ||||
| from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error | ||||
| from .init_config import Optimizations, init_config | ||||
| 
 | ||||
| 
 | ||||
| @debug_cli.command( | ||||
spacy/cli/debug_model.py
@@ -1,19 +1,32 @@
| from typing import Dict, Any, Optional | ||||
| from pathlib import Path | ||||
| import itertools | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Optional | ||||
| 
 | ||||
| import typer | ||||
| from thinc.api import ( | ||||
|     Model, | ||||
|     data_validation, | ||||
|     fix_random_seed, | ||||
|     set_dropout_rate, | ||||
|     set_gpu_allocator, | ||||
| ) | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from spacy.training import Example | ||||
| from spacy.util import resolve_dot_names | ||||
| from wasabi import msg | ||||
| from thinc.api import fix_random_seed, set_dropout_rate | ||||
| from thinc.api import Model, data_validation, set_gpu_allocator | ||||
| import typer | ||||
| 
 | ||||
| from ._util import Arg, Opt, debug_cli, show_validation_error | ||||
| from ._util import parse_config_overrides, string_to_list, setup_gpu | ||||
| from .. import util | ||||
| from ..schemas import ConfigSchemaTraining | ||||
| from ..util import registry | ||||
| from .. import util | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     debug_cli, | ||||
|     parse_config_overrides, | ||||
|     setup_gpu, | ||||
|     show_validation_error, | ||||
|     string_to_list, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @debug_cli.command( | ||||
spacy/cli/download.py
@@ -1,14 +1,20 @@
| from typing import Optional, Sequence | ||||
| import requests | ||||
| import sys | ||||
| from wasabi import msg | ||||
| import typer | ||||
| from typing import Optional, Sequence | ||||
| 
 | ||||
| import requests | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX | ||||
| from .. import about | ||||
| from ..util import is_package, get_minor_version, run_command | ||||
| from ..util import is_prerelease_version, get_installed_models | ||||
| from ..util import get_package_version | ||||
| from ..util import ( | ||||
|     get_installed_models, | ||||
|     get_minor_version, | ||||
|     get_package_version, | ||||
|     is_package, | ||||
|     is_prerelease_version, | ||||
|     run_command, | ||||
| ) | ||||
| from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app | ||||
| 
 | ||||
| 
 | ||||
| @app.command( | ||||
@@ -83,11 +89,8 @@ def download(
| 
 | ||||
| def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str: | ||||
|     dl_tpl = "{m}-{v}/{m}-{v}{s}" | ||||
|     egg_tpl = "#egg={m}=={v}" | ||||
|     suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX | ||||
|     filename = dl_tpl.format(m=model_name, v=version, s=suffix) | ||||
|     if sdist: | ||||
|         filename += egg_tpl.format(m=model_name, v=version) | ||||
|     return filename | ||||
| 
 | ||||
| 
 | ||||
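After this change, `get_model_filename` no longer appends an `#egg=` fragment for sdists; it just joins the release directory and archive name. A rough sketch of the resulting behaviour, assuming the usual spaCy suffix values (`.tar.gz` for sdists, `-py3-none-any.whl` for wheels; the real constants live in `spacy/cli/_util.py`):

```python
SDIST_SUFFIX = ".tar.gz"            # assumed value of _util.SDIST_SUFFIX
WHEEL_SUFFIX = "-py3-none-any.whl"  # assumed value of _util.WHEEL_SUFFIX

def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
    # Same template as the updated function above: release dir / archive name.
    dl_tpl = "{m}-{v}/{m}-{v}{s}"
    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
    return dl_tpl.format(m=model_name, v=version, s=suffix)

print(get_model_filename("ca_core_news_sm", "3.5.0", sdist=True))
# -> ca_core_news_sm-3.5.0/ca_core_news_sm-3.5.0.tar.gz
```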
spacy/cli/evaluate.py
@@ -1,16 +1,16 @@
| from typing import Optional, List, Dict, Any, Union | ||||
| from wasabi import Printer | ||||
| from pathlib import Path | ||||
| import re | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, List, Optional, Union | ||||
| 
 | ||||
| import srsly | ||||
| from thinc.api import fix_random_seed | ||||
| from wasabi import Printer | ||||
| 
 | ||||
| from ..training import Corpus | ||||
| from ..tokens import Doc | ||||
| from ._util import app, Arg, Opt, setup_gpu, import_code_paths, benchmark_cli | ||||
| from .. import displacy, util | ||||
| from ..scorer import Scorer | ||||
| from .. import util | ||||
| from .. import displacy | ||||
| from ..tokens import Doc | ||||
| from ..training import Corpus | ||||
| from ._util import Arg, Opt, app, benchmark_cli, import_code_paths, setup_gpu | ||||
| 
 | ||||
| 
 | ||||
| @benchmark_cli.command( | ||||
@@ -27,6 +27,7 @@ def evaluate_cli(
|     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), | ||||
|     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), | ||||
|     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), | ||||
|     per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """ | ||||
@@ -50,6 +51,7 @@ def evaluate_cli(
|         gold_preproc=gold_preproc, | ||||
|         displacy_path=displacy_path, | ||||
|         displacy_limit=displacy_limit, | ||||
|         per_component=per_component, | ||||
|         silent=False, | ||||
|     ) | ||||
| 
 | ||||
@@ -64,6 +66,7 @@ def evaluate(
|     displacy_limit: int = 25, | ||||
|     silent: bool = True, | ||||
|     spans_key: str = "sc", | ||||
|     per_component: bool = False, | ||||
| ) -> Dict[str, Any]: | ||||
|     msg = Printer(no_print=silent, pretty=not silent) | ||||
|     fix_random_seed() | ||||
|  | @ -78,7 +81,16 @@ def evaluate( | |||
|     corpus = Corpus(data_path, gold_preproc=gold_preproc) | ||||
|     nlp = util.load_model(model) | ||||
|     dev_dataset = list(corpus(nlp)) | ||||
|     scores = nlp.evaluate(dev_dataset) | ||||
|     scores = nlp.evaluate(dev_dataset, per_component=per_component) | ||||
|     if per_component: | ||||
|         data = scores | ||||
|         if output is None: | ||||
|             msg.warn( | ||||
|                 "The per-component option is enabled but there is no output JSON file provided to save the scores to." | ||||
|             ) | ||||
|         else: | ||||
|             msg.info("Per-component scores will be saved to output JSON file.") | ||||
|     else: | ||||
|         metrics = { | ||||
|             "TOK": "token_acc", | ||||
|             "TAG": "tag_acc", | ||||
|  | @ -122,6 +134,8 @@ def evaluate( | |||
|         docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) | ||||
|         render_deps = "parser" in factory_names | ||||
|         render_ents = "ner" in factory_names | ||||
|         render_spans = "spancat" in factory_names | ||||
| 
 | ||||
|         render_parses( | ||||
|             docs, | ||||
|             displacy_path, | ||||
|  | @ -129,6 +143,7 @@ def evaluate( | |||
|             limit=displacy_limit, | ||||
|             deps=render_deps, | ||||
|             ents=render_ents, | ||||
|             spans=render_spans, | ||||
|         ) | ||||
|         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) | ||||
| 
 | ||||
|  | @ -182,6 +197,7 @@ def render_parses( | |||
|     limit: int = 250, | ||||
|     deps: bool = True, | ||||
|     ents: bool = True, | ||||
|     spans: bool = True, | ||||
| ): | ||||
|     docs[0].user_data["title"] = model_name | ||||
|     if ents: | ||||
|  | @ -195,6 +211,11 @@ def render_parses( | |||
|         with (output_path / "parses.html").open("w", encoding="utf8") as file_: | ||||
|             file_.write(html) | ||||
| 
 | ||||
|     if spans: | ||||
|         html = displacy.render(docs[:limit], style="span", page=True) | ||||
|         with (output_path / "spans.html").open("w", encoding="utf8") as file_: | ||||
|             file_.write(html) | ||||
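The new spans branch mirrors the deps/ents branches above. A standalone sketch of the same displacy call with toy data (style="span" reads doc.spans["sc"] by default):

    import spacy
    from spacy import displacy
    from spacy.tokens import Span

    nlp = spacy.blank("en")
    doc = nlp("Welcome to the Bank of China.")
    doc.spans["sc"] = [Span(doc, 3, 6, "ORG")]  # toy span under the default key
    html = displacy.render(doc, style="span", page=True)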
| 
 | ||||
| 
 | ||||
| def print_prf_per_type( | ||||
|     msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str | ||||
|  |  | |||
|  | @ -1,17 +1,17 @@ | |||
| import functools | ||||
| import logging | ||||
| import operator | ||||
| from pathlib import Path | ||||
| import logging | ||||
| from typing import Optional, Tuple, Any, Dict, List | ||||
| from typing import Any, Dict, List, Optional, Tuple | ||||
| 
 | ||||
| import numpy | ||||
| import wasabi.tables | ||||
| 
 | ||||
| from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer | ||||
| from ..errors import Errors | ||||
| from ..training import Corpus | ||||
| from ._util import app, Arg, Opt, import_code, setup_gpu | ||||
| from .. import util | ||||
| from ..errors import Errors | ||||
| from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer | ||||
| from ..training import Corpus | ||||
| from ._util import Arg, Opt, app, import_code, setup_gpu | ||||
| 
 | ||||
| _DEFAULTS = { | ||||
|     "n_trials": 11, | ||||
|  | @ -35,7 +35,7 @@ def find_threshold_cli( | |||
|     code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), | ||||
|     use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"), | ||||
|     gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"), | ||||
|     verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"), | ||||
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """ | ||||
|  |  | |||
|  | @ -1,15 +1,15 @@ | |||
| from typing import Optional, Dict, Any, Union, List | ||||
| import platform | ||||
| import json | ||||
| from pathlib import Path | ||||
| from wasabi import Printer, MarkdownRenderer | ||||
| import srsly | ||||
| import importlib.metadata | ||||
| import json | ||||
| import platform | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, List, Optional, Union | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, string_to_list | ||||
| from .download import get_model_filename, get_latest_version | ||||
| from .. import util | ||||
| from .. import about | ||||
| import srsly | ||||
| from wasabi import MarkdownRenderer, Printer | ||||
| 
 | ||||
| from .. import about, util | ||||
| from ._util import Arg, Opt, app, string_to_list | ||||
| from .download import get_latest_version, get_model_filename | ||||
| 
 | ||||
| 
 | ||||
| @app.command("info") | ||||
|  |  | |||
|  | @ -1,19 +1,27 @@ | |||
| from typing import Optional, List, Tuple | ||||
| import re | ||||
| from enum import Enum | ||||
| from pathlib import Path | ||||
| from wasabi import Printer, diff_strings | ||||
| from thinc.api import Config | ||||
| from typing import List, Optional, Tuple | ||||
| 
 | ||||
| import srsly | ||||
| import re | ||||
| from jinja2 import Template | ||||
| from thinc.api import Config | ||||
| from wasabi import Printer, diff_strings | ||||
| 
 | ||||
| from .. import util | ||||
| from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH | ||||
| from ..schemas import RecommendationSchema | ||||
| from ..util import SimpleFrozenList | ||||
| from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND | ||||
| from ._util import string_to_list, import_code, _handle_renamed_language_codes | ||||
| 
 | ||||
| from ._util import ( | ||||
|     COMMAND, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     _handle_renamed_language_codes, | ||||
|     import_code, | ||||
|     init_cli, | ||||
|     show_validation_error, | ||||
|     string_to_list, | ||||
| ) | ||||
| 
 | ||||
| ROOT = Path(__file__).parent / "templates" | ||||
| TEMPLATE_PATH = ROOT / "quickstart_training.jinja" | ||||
|  |  | |||
|  | @ -1,15 +1,24 @@ | |||
| from typing import Optional | ||||
| import logging | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import typer | ||||
| from typing import Optional | ||||
| 
 | ||||
| import srsly | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .. import util | ||||
| from ..training.initialize import init_nlp, convert_vectors | ||||
| from ..language import Language | ||||
| from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error | ||||
| from ._util import import_code, setup_gpu, _handle_renamed_language_codes | ||||
| from ..training.initialize import convert_vectors, init_nlp | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     _handle_renamed_language_codes, | ||||
|     import_code, | ||||
|     init_cli, | ||||
|     parse_config_overrides, | ||||
|     setup_gpu, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @init_cli.command("vectors") | ||||
|  |  | |||
|  | @ -1,18 +1,18 @@ | |||
| from typing import Optional, Union, Any, Dict, List, Tuple, cast | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from wasabi import Printer, MarkdownRenderer, get_raw_input | ||||
| from thinc.api import Config | ||||
| from collections import defaultdict | ||||
| from catalogue import RegistryError | ||||
| import srsly | ||||
| import sys | ||||
| import re | ||||
| import shutil | ||||
| import sys | ||||
| from collections import defaultdict | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, List, Optional, Tuple, Union, cast | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX | ||||
| from ..schemas import validate, ModelMetaSchema | ||||
| from .. import util | ||||
| from .. import about | ||||
| import srsly | ||||
| from catalogue import RegistryError | ||||
| from thinc.api import Config | ||||
| from wasabi import MarkdownRenderer, Printer, get_raw_input | ||||
| 
 | ||||
| from .. import about, util | ||||
| from ..schemas import ModelMetaSchema, validate | ||||
| from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list | ||||
| 
 | ||||
| 
 | ||||
| @app.command("package") | ||||
|  |  | |||
|  | @ -1,13 +1,21 @@ | |||
| from typing import Optional | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import typer | ||||
| import re | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error | ||||
| from ._util import import_code_paths, setup_gpu | ||||
| from ..training.pretrain import pretrain | ||||
| from ..util import load_config | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     app, | ||||
|     import_code_paths, | ||||
|     parse_config_overrides, | ||||
|     setup_gpu, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @app.command( | ||||
|  | @ -23,6 +31,7 @@ def pretrain_cli( | |||
|     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), | ||||
|     epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), | ||||
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), | ||||
|     skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """ | ||||
|  | @ -74,6 +83,7 @@ def pretrain_cli( | |||
|         epoch_resume=epoch_resume, | ||||
|         use_gpu=use_gpu, | ||||
|         silent=False, | ||||
|         skip_last=skip_last, | ||||
|     ) | ||||
|     msg.good("Successfully finished pretrain") | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,17 +1,18 @@ | |||
| from typing import Optional, Sequence, Union, Iterator | ||||
| import tqdm | ||||
| from pathlib import Path | ||||
| import srsly | ||||
| import cProfile | ||||
| import itertools | ||||
| import pstats | ||||
| import sys | ||||
| import itertools | ||||
| from wasabi import msg, Printer | ||||
| import typer | ||||
| from pathlib import Path | ||||
| from typing import Iterator, Optional, Sequence, Union | ||||
| 
 | ||||
| import srsly | ||||
| import tqdm | ||||
| import typer | ||||
| from wasabi import Printer, msg | ||||
| 
 | ||||
| from ._util import app, debug_cli, Arg, Opt, NAME | ||||
| from ..language import Language | ||||
| from ..util import load_model | ||||
| from ._util import NAME, Arg, Opt, app, debug_cli | ||||
| 
 | ||||
| 
 | ||||
| @debug_cli.command("profile") | ||||
|  |  | |||
|  | @ -1,16 +1,27 @@ | |||
| from typing import Any, Dict, Optional | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Optional | ||||
| 
 | ||||
| import requests | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ...util import ensure_path, working_dir | ||||
| from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config | ||||
| from .._util import get_checksum, download_file, git_checkout, get_git_version | ||||
| from .._util import SimpleFrozenDict, parse_config_overrides | ||||
| from .._util import ( | ||||
|     PROJECT_FILE, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     SimpleFrozenDict, | ||||
|     download_file, | ||||
|     get_checksum, | ||||
|     get_git_version, | ||||
|     git_checkout, | ||||
|     load_project_config, | ||||
|     parse_config_overrides, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| # Whether assets are extra if `extra` is not set. | ||||
| EXTRA_DEFAULT = False | ||||
|  |  | |||
|  | @ -1,13 +1,22 @@ | |||
| from typing import Optional | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import subprocess | ||||
| import re | ||||
| import subprocess | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ... import about | ||||
| from ...util import ensure_path | ||||
| from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE | ||||
| from .._util import git_checkout, get_git_version, git_repo_branch_exists | ||||
| from .._util import ( | ||||
|     COMMAND, | ||||
|     PROJECT_FILE, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     get_git_version, | ||||
|     git_checkout, | ||||
|     git_repo_branch_exists, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| DEFAULT_REPO = about.__projects__ | ||||
| DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ | ||||
|  |  | |||
|  | @ -1,9 +1,9 @@ | |||
| from pathlib import Path | ||||
| from wasabi import msg, MarkdownRenderer | ||||
| 
 | ||||
| from wasabi import MarkdownRenderer, msg | ||||
| 
 | ||||
| from ...util import working_dir | ||||
| from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config | ||||
| 
 | ||||
| from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli | ||||
| 
 | ||||
| DOCS_URL = "https://spacy.io" | ||||
| INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the | ||||
|  |  | |||
|  | @ -1,15 +1,28 @@ | |||
| """This module contains helpers and subcommands for integrating spaCy projects | ||||
with Data Version Control (DVC). https://dvc.org""" | ||||
| from typing import Dict, Any, List, Optional, Iterable | ||||
| import subprocess | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Iterable, List, Optional | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli | ||||
| from .._util import Arg, Opt, NAME, COMMAND | ||||
| from ...util import working_dir, split_command, join_command, run_command | ||||
| from ...util import SimpleFrozenList | ||||
| 
 | ||||
| from ...util import ( | ||||
|     SimpleFrozenList, | ||||
|     join_command, | ||||
|     run_command, | ||||
|     split_command, | ||||
|     working_dir, | ||||
| ) | ||||
| from .._util import ( | ||||
|     COMMAND, | ||||
|     NAME, | ||||
|     PROJECT_FILE, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     get_hash, | ||||
|     load_project_config, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| DVC_CONFIG = "dvc.yaml" | ||||
| DVC_DIR = ".dvc" | ||||
|  |  | |||
|  | @ -1,9 +1,9 @@ | |||
| from pathlib import Path | ||||
| 
 | ||||
| from wasabi import msg | ||||
| from .remote_storage import RemoteStorage | ||||
| from .remote_storage import get_command_hash | ||||
| from .._util import project_cli, Arg, logger | ||||
| from .._util import load_project_config | ||||
| 
 | ||||
| from .._util import Arg, load_project_config, logger, project_cli | ||||
| from .remote_storage import RemoteStorage, get_command_hash | ||||
| from .run import update_lockfile | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,9 +1,9 @@ | |||
| from pathlib import Path | ||||
| 
 | ||||
| from wasabi import msg | ||||
| from .remote_storage import RemoteStorage | ||||
| from .remote_storage import get_content_hash, get_command_hash | ||||
| from .._util import load_project_config | ||||
| from .._util import project_cli, Arg, logger | ||||
| 
 | ||||
| from .._util import Arg, load_project_config, logger, project_cli | ||||
| from .remote_storage import RemoteStorage, get_command_hash, get_content_hash | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command("push") | ||||
|  |  | |||
|  | @ -1,18 +1,25 @@ | |||
| from typing import Optional, List, Dict, TYPE_CHECKING | ||||
| import hashlib | ||||
| import os | ||||
| import site | ||||
| import hashlib | ||||
| import urllib.parse | ||||
| import tarfile | ||||
| import urllib.parse | ||||
| from pathlib import Path | ||||
| from typing import TYPE_CHECKING, Dict, List, Optional | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .._util import get_hash, get_checksum, upload_file, download_file | ||||
| from .._util import ensure_pathy, make_tempdir | ||||
| from ...util import get_minor_version, ENV_VARS, check_bool_env_var | ||||
| from ...git_info import GIT_VERSION | ||||
| from ... import about | ||||
| from ...errors import Errors | ||||
| from ...git_info import GIT_VERSION | ||||
| from ...util import ENV_VARS, check_bool_env_var, get_minor_version | ||||
| from .._util import ( | ||||
|     download_file, | ||||
|     ensure_pathy, | ||||
|     get_checksum, | ||||
|     get_hash, | ||||
|     make_tempdir, | ||||
|     upload_file, | ||||
| ) | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from pathy import FluidPath  # noqa: F401 | ||||
|  |  | |||
|  | @ -1,20 +1,39 @@ | |||
| from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple | ||||
| import os.path | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from wasabi import msg | ||||
| from wasabi.util import locale_escape | ||||
| import sys | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple | ||||
| 
 | ||||
| import srsly | ||||
| import typer | ||||
| from wasabi import msg | ||||
| from wasabi.util import locale_escape | ||||
| 
 | ||||
| from ... import about | ||||
| from ...git_info import GIT_VERSION | ||||
| from ...util import working_dir, run_command, split_command, is_cwd, join_command | ||||
| from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS | ||||
| from ...util import check_bool_env_var, SimpleFrozenDict | ||||
| from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash | ||||
| from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides | ||||
| from ...util import ( | ||||
|     ENV_VARS, | ||||
|     SimpleFrozenDict, | ||||
|     SimpleFrozenList, | ||||
|     check_bool_env_var, | ||||
|     is_cwd, | ||||
|     is_minor_version_match, | ||||
|     join_command, | ||||
|     run_command, | ||||
|     split_command, | ||||
|     working_dir, | ||||
| ) | ||||
| from .._util import ( | ||||
|     COMMAND, | ||||
|     PROJECT_FILE, | ||||
|     PROJECT_LOCK, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     get_checksum, | ||||
|     get_hash, | ||||
|     load_project_config, | ||||
|     parse_config_overrides, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command( | ||||
|  |  | |||
|  | @ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and | |||
| can help generate the best possible configuration, given a user's requirements. #} | ||||
| {%- set use_transformer = hardware != "cpu" and transformer_data -%} | ||||
| {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} | ||||
| {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%} | ||||
| {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%} | ||||
| [paths] | ||||
| train = null | ||||
| dev = null | ||||
|  | @ -24,8 +24,11 @@ gpu_allocator = null | |||
| lang = "{{ lang }}" | ||||
| {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%} | ||||
| {%- set with_accuracy = optimize == "accuracy" -%} | ||||
| {%- set has_accurate_textcat = has_textcat and with_accuracy -%} | ||||
| {%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%} | ||||
| {# The BOW textcat doesn't need a source of features, so it can omit the | ||||
| tok2vec/transformer. #} | ||||
| {%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%} | ||||
| {%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%} | ||||
| {%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%} | ||||
| {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%} | ||||
| {%- else -%} | ||||
| {%- set full_pipeline = components -%} | ||||
|  | @ -122,6 +125,30 @@ grad_factor = 1.0 | |||
| @layers = "reduce_mean.v1" | ||||
| {% endif -%} | ||||
| 
 | ||||
| {% if "span_finder" in components -%} | ||||
| [components.span_finder] | ||||
| factory = "span_finder" | ||||
| max_length = null | ||||
| min_length = null | ||||
| scorer = {"@scorers":"spacy.span_finder_scorer.v1"} | ||||
| spans_key = "sc" | ||||
| threshold = 0.5 | ||||
| 
 | ||||
| [components.span_finder.model] | ||||
| @architectures = "spacy.SpanFinder.v1" | ||||
| 
 | ||||
| [components.span_finder.model.scorer] | ||||
| @layers = "spacy.LinearLogistic.v1" | ||||
| nO = 2 | ||||
| 
 | ||||
| [components.span_finder.model.tok2vec] | ||||
| @architectures = "spacy-transformers.TransformerListener.v1" | ||||
| grad_factor = 1.0 | ||||
| 
 | ||||
| [components.span_finder.model.tok2vec.pooling] | ||||
| @layers = "reduce_mean.v1" | ||||
| {% endif -%} | ||||
| 
 | ||||
| {% if "spancat" in components -%} | ||||
| [components.spancat] | ||||
| factory = "spancat" | ||||
|  | @ -154,6 +181,36 @@ grad_factor = 1.0 | |||
| sizes = [1,2,3] | ||||
| {% endif -%} | ||||
| 
 | ||||
| {% if "spancat_singlelabel" in components %} | ||||
| [components.spancat_singlelabel] | ||||
| factory = "spancat_singlelabel" | ||||
| negative_weight = 1.0 | ||||
| allow_overlap = true | ||||
| scorer = {"@scorers":"spacy.spancat_scorer.v1"} | ||||
| spans_key = "sc" | ||||
| 
 | ||||
| [components.spancat_singlelabel.model] | ||||
| @architectures = "spacy.SpanCategorizer.v1" | ||||
| 
 | ||||
| [components.spancat_singlelabel.model.reducer] | ||||
| @layers = "spacy.mean_max_reducer.v1" | ||||
| hidden_size = 128 | ||||
| 
 | ||||
| [components.spancat_singlelabel.model.scorer] | ||||
| @layers = "Softmax.v2" | ||||
| 
 | ||||
| [components.spancat_singlelabel.model.tok2vec] | ||||
| @architectures = "spacy-transformers.TransformerListener.v1" | ||||
| grad_factor = 1.0 | ||||
| 
 | ||||
| [components.spancat_singlelabel.model.tok2vec.pooling] | ||||
| @layers = "reduce_mean.v1" | ||||
| 
 | ||||
| [components.spancat_singlelabel.suggester] | ||||
| @misc = "spacy.ngram_suggester.v1" | ||||
| sizes = [1,2,3] | ||||
| {% endif %} | ||||
| 
 | ||||
| {% if "trainable_lemmatizer" in components -%} | ||||
| [components.trainable_lemmatizer] | ||||
| factory = "trainable_lemmatizer" | ||||
|  | @ -219,10 +276,16 @@ no_output_layer = false | |||
| 
 | ||||
| {% else -%} | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatCNN.v2" | ||||
| exclusive_classes = true | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| nO = null | ||||
| 
 | ||||
| [components.textcat.model.tok2vec] | ||||
| @architectures = "spacy-transformers.TransformerListener.v1" | ||||
| grad_factor = 1.0 | ||||
| 
 | ||||
| [components.textcat.model.tok2vec.pooling] | ||||
| @layers = "reduce_mean.v1" | ||||
| {%- endif %} | ||||
| {%- endif %} | ||||
| 
 | ||||
|  | @ -250,10 +313,16 @@ no_output_layer = false | |||
| 
 | ||||
| {% else -%} | ||||
| [components.textcat_multilabel.model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatCNN.v2" | ||||
| exclusive_classes = false | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| nO = null | ||||
| 
 | ||||
| [components.textcat_multilabel.model.tok2vec] | ||||
| @architectures = "spacy-transformers.TransformerListener.v1" | ||||
| grad_factor = 1.0 | ||||
| 
 | ||||
| [components.textcat_multilabel.model.tok2vec.pooling] | ||||
| @layers = "reduce_mean.v1" | ||||
| {%- endif %} | ||||
| {%- endif %} | ||||
| 
 | ||||
|  | @ -284,6 +353,7 @@ maxout_pieces = 3 | |||
| {% if "morphologizer" in components %} | ||||
| [components.morphologizer] | ||||
| factory = "morphologizer" | ||||
| label_smoothing = 0.05 | ||||
| 
 | ||||
| [components.morphologizer.model] | ||||
| @architectures = "spacy.Tagger.v2" | ||||
|  | @ -297,6 +367,7 @@ width = ${components.tok2vec.model.encode.width} | |||
| {% if "tagger" in components %} | ||||
| [components.tagger] | ||||
| factory = "tagger" | ||||
| label_smoothing = 0.05 | ||||
| 
 | ||||
| [components.tagger.model] | ||||
| @architectures = "spacy.Tagger.v2" | ||||
|  | @ -341,6 +412,27 @@ nO = null | |||
| width = ${components.tok2vec.model.encode.width} | ||||
| {% endif %} | ||||
| 
 | ||||
| {% if "span_finder" in components %} | ||||
| [components.span_finder] | ||||
| factory = "span_finder" | ||||
| max_length = null | ||||
| min_length = null | ||||
| scorer = {"@scorers":"spacy.span_finder_scorer.v1"} | ||||
| spans_key = "sc" | ||||
| threshold = 0.5 | ||||
| 
 | ||||
| [components.span_finder.model] | ||||
| @architectures = "spacy.SpanFinder.v1" | ||||
| 
 | ||||
| [components.span_finder.model.scorer] | ||||
| @layers = "spacy.LinearLogistic.v1" | ||||
| nO = 2 | ||||
| 
 | ||||
| [components.span_finder.model.tok2vec] | ||||
| @architectures = "spacy.Tok2VecListener.v1" | ||||
| width = ${components.tok2vec.model.encode.width} | ||||
| {% endif %} | ||||
| 
 | ||||
| {% if "spancat" in components %} | ||||
| [components.spancat] | ||||
| factory = "spancat" | ||||
|  | @ -370,6 +462,33 @@ width = ${components.tok2vec.model.encode.width} | |||
| sizes = [1,2,3] | ||||
| {% endif %} | ||||
| 
 | ||||
| {% if "spancat_singlelabel" in components %} | ||||
| [components.spancat_singlelabel] | ||||
| factory = "spancat_singlelabel" | ||||
| negative_weight = 1.0 | ||||
| allow_overlap = true | ||||
| scorer = {"@scorers":"spacy.spancat_scorer.v1"} | ||||
| spans_key = "sc" | ||||
| 
 | ||||
| [components.spancat_singlelabel.model] | ||||
| @architectures = "spacy.SpanCategorizer.v1" | ||||
| 
 | ||||
| [components.spancat_singlelabel.model.reducer] | ||||
| @layers = "spacy.mean_max_reducer.v1" | ||||
| hidden_size = 128 | ||||
| 
 | ||||
| [components.spancat_singlelabel.model.scorer] | ||||
| @layers = "Softmax.v2" | ||||
| 
 | ||||
| [components.spancat_singlelabel.model.tok2vec] | ||||
| @architectures = "spacy.Tok2VecListener.v1" | ||||
| width = ${components.tok2vec.model.encode.width} | ||||
| 
 | ||||
| [components.spancat_singlelabel.suggester] | ||||
| @misc = "spacy.ngram_suggester.v1" | ||||
| sizes = [1,2,3] | ||||
| {% endif %} | ||||
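The template branches above are exercised through the quickstart generator. A hedged sketch of producing a config that includes the new components (keyword names per spacy.cli.init_config, treated as assumptions here):

    from spacy.cli.init_config import init_config

    config = init_config(
        lang="en",
        pipeline=["span_finder", "spancat"],  # components the template now recognizes
        optimize="efficiency",
        gpu=False,
    )
    print(config.to_str())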
| 
 | ||||
| {% if "trainable_lemmatizer" in components -%} | ||||
| [components.trainable_lemmatizer] | ||||
| factory = "trainable_lemmatizer" | ||||
|  |  | |||
|  | @ -1,15 +1,23 @@ | |||
| from typing import Optional, Dict, Any, Union | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
| import typer | ||||
| import logging | ||||
| import sys | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Optional, Union | ||||
| 
 | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error | ||||
| from ._util import import_code_paths, setup_gpu | ||||
| from ..training.loop import train as train_nlp | ||||
| from ..training.initialize import init_nlp | ||||
| from .. import util | ||||
| from ..training.initialize import init_nlp | ||||
| from ..training.loop import train as train_nlp | ||||
| from ._util import ( | ||||
|     Arg, | ||||
|     Opt, | ||||
|     app, | ||||
|     import_code_paths, | ||||
|     parse_config_overrides, | ||||
|     setup_gpu, | ||||
|     show_validation_error, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @app.command( | ||||
|  |  | |||
|  | @ -1,14 +1,21 @@ | |||
| from typing import Tuple | ||||
| from pathlib import Path | ||||
| import sys | ||||
| import requests | ||||
| from wasabi import msg, Printer | ||||
| import warnings | ||||
| from pathlib import Path | ||||
| from typing import Tuple | ||||
| 
 | ||||
| import requests | ||||
| from wasabi import Printer, msg | ||||
| 
 | ||||
| from ._util import app | ||||
| from .. import about | ||||
| from ..util import get_package_version, get_installed_models, get_minor_version | ||||
| from ..util import get_package_path, get_model_meta, is_compatible_version | ||||
| from ..util import ( | ||||
|     get_installed_models, | ||||
|     get_minor_version, | ||||
|     get_model_meta, | ||||
|     get_package_path, | ||||
|     get_package_version, | ||||
|     is_compatible_version, | ||||
| ) | ||||
| from ._util import app | ||||
| 
 | ||||
| 
 | ||||
| @app.command("validate") | ||||
|  |  | |||
|  | @ -1,5 +1,6 @@ | |||
| """Helpers for Python and platform compatibility.""" | ||||
| import sys | ||||
| 
 | ||||
| from thinc.util import copy_array | ||||
| 
 | ||||
| try: | ||||
|  |  | |||
|  | @ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities. | |||
| DOCS: https://spacy.io/api/top-level#displacy | ||||
| USAGE: https://spacy.io/usage/visualizers | ||||
| """ | ||||
| from typing import Union, Iterable, Optional, Dict, Any, Callable | ||||
| import warnings | ||||
| from typing import Any, Callable, Dict, Iterable, Optional, Union | ||||
| 
 | ||||
| from .render import DependencyRenderer, EntityRenderer, SpanRenderer | ||||
| from ..tokens import Doc, Span | ||||
| from ..errors import Errors, Warnings | ||||
| from ..util import is_in_jupyter | ||||
| from ..util import find_available_port | ||||
| 
 | ||||
| from ..tokens import Doc, Span | ||||
| from ..util import find_available_port, is_in_jupyter | ||||
| from .render import DependencyRenderer, EntityRenderer, SpanRenderer | ||||
| 
 | ||||
| _html = {} | ||||
| RENDER_WRAPPER = None | ||||
|  | @ -68,7 +66,7 @@ def render( | |||
|     if jupyter or (jupyter is None and is_in_jupyter()): | ||||
|         # return HTML rendered by IPython display() | ||||
|         # See #4840 for details on span wrapper to disable mathjax | ||||
|         from IPython.core.display import display, HTML | ||||
|         from IPython.core.display import HTML, display | ||||
| 
 | ||||
|         return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html))) | ||||
|     return html | ||||
|  | @ -125,13 +123,17 @@ def app(environ, start_response): | |||
|     return [res] | ||||
| 
 | ||||
| 
 | ||||
| def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: | ||||
| def parse_deps( | ||||
|     orig_doc: Union[Doc, Span], options: Dict[str, Any] = {} | ||||
| ) -> Dict[str, Any]: | ||||
|     """Generate dependency parse in {'words': [], 'arcs': []} format. | ||||
| 
 | ||||
|     orig_doc (Doc): Document to parse. | ||||
|     orig_doc (Union[Doc, Span]): Document to parse. | ||||
|     options (Dict[str, Any]): Dependency parse specific visualisation options. | ||||
|     RETURNS (dict): Generated dependency parse keyed by words and arcs. | ||||
|     """ | ||||
|     if isinstance(orig_doc, Span): | ||||
|         orig_doc = orig_doc.as_doc() | ||||
|     doc = Doc(orig_doc.vocab).from_bytes( | ||||
|         orig_doc.to_bytes(exclude=["user_data", "user_hooks"]) | ||||
|     ) | ||||
|  |  | |||
|  | @ -1,15 +1,29 @@ | |||
| from typing import Any, Dict, List, Optional, Tuple, Union | ||||
| import uuid | ||||
| import itertools | ||||
| import uuid | ||||
| from typing import Any, Dict, List, Optional, Tuple, Union | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| from ..util import escape_html, minify_html, registry | ||||
| from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS | ||||
| from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS | ||||
| from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN | ||||
| from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL | ||||
| from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS | ||||
| from .templates import TPL_TITLE | ||||
| from .templates import ( | ||||
|     TPL_DEP_ARCS, | ||||
|     TPL_DEP_SVG, | ||||
|     TPL_DEP_WORDS, | ||||
|     TPL_DEP_WORDS_LEMMA, | ||||
|     TPL_ENT, | ||||
|     TPL_ENT_RTL, | ||||
|     TPL_ENTS, | ||||
|     TPL_FIGURE, | ||||
|     TPL_KB_LINK, | ||||
|     TPL_PAGE, | ||||
|     TPL_SPAN, | ||||
|     TPL_SPAN_RTL, | ||||
|     TPL_SPAN_SLICE, | ||||
|     TPL_SPAN_SLICE_RTL, | ||||
|     TPL_SPAN_START, | ||||
|     TPL_SPAN_START_RTL, | ||||
|     TPL_SPANS, | ||||
|     TPL_TITLE, | ||||
| ) | ||||
| 
 | ||||
| DEFAULT_LANG = "en" | ||||
| DEFAULT_DIR = "ltr" | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from typing import Literal | ||||
| import warnings | ||||
| from typing import Literal | ||||
| 
 | ||||
| 
 | ||||
| class ErrorsWithCodes(type): | ||||
|  | @ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes): | |||
|             "ignoring the duplicate entry.") | ||||
|     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " | ||||
|             "incorrect. Modify PhraseMatcher._terminal_hash to fix.") | ||||
|     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " | ||||
|     W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in " | ||||
|             "the Knowledge Base.") | ||||
|     W026 = ("Unable to set all sentence boundaries from dependency parses. If " | ||||
|             "you are constructing a parse tree incrementally by setting " | ||||
|  | @ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes): | |||
|             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") | ||||
|     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") | ||||
| 
 | ||||
|     # v4 warning strings | ||||
|     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") | ||||
|     W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability " | ||||
|             "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure " | ||||
|             "to return `True` in `.supports_prior_probs`.") | ||||
| 
 | ||||
| 
 | ||||
| class Errors(metaclass=ErrorsWithCodes): | ||||
|  | @ -542,6 +546,8 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|             "during training, make sure to include it in 'annotating components'") | ||||
| 
 | ||||
|     # New errors added in v3.x | ||||
|     E850 = ("The PretrainVectors objective currently only supports default or " | ||||
|             "floret vectors, not {mode} vectors.") | ||||
|     E851 = ("The 'textcat' component labels should only have values of 0 or 1, " | ||||
|             "but found value of '{val}'.") | ||||
|     E852 = ("The tar file pulled from the remote attempted an unsafe path " | ||||
|  | @ -922,7 +928,7 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|     E1029 = ("Edit tree cannot be applied to form.") | ||||
|     E1030 = ("Edit tree identifier out of range.") | ||||
|     E1031 = ("Could not find gold transition - see logs above.") | ||||
|     E1032 = ("`{var}` should not be {forbidden}, but received {value}.") | ||||
|     E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.") | ||||
|     E1033 = ("Dimension {name} invalid -- only nO, nF, nP") | ||||
|     E1034 = ("Node index {i} out of bounds ({length})") | ||||
|     E1035 = ("Token index {i} out of bounds ({length})") | ||||
|  | @ -951,6 +957,14 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|              "with `displacy.serve(doc, port=port)`") | ||||
|     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " | ||||
|              "or use `auto_select_port=True` to pick an available port automatically.") | ||||
|     E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.") | ||||
|     E1052 = ("Unable to copy spans: the character offsets for the span at " | ||||
|              "index {i} in the span group do not align with the tokenization " | ||||
|              "in the target doc.") | ||||
|     E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found" | ||||
|              " 'min_length': {min_length}, 'max_length': {max_length}") | ||||
|     E1054 = ("The text, including whitespace, must match between reference and " | ||||
|              "predicted docs when training {component}.") | ||||
| 
 | ||||
|     # v4 error strings | ||||
|     E4000 = ("Expected a Doc as input, but got: '{type}'") | ||||
|  | @ -961,6 +975,12 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|              "reference and predicted docs.") | ||||
|     E4004 = ("Backprop is not supported when is_train is not set.") | ||||
|     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.") | ||||
|     E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.") | ||||
|     E4007 = ("Span {var} {value} must be {op} Span {existing_var} " | ||||
|              "{existing_value}.") | ||||
|     E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.") | ||||
|     E4009 = ("The '{attr}' parameter should be 'None' or 'True', but found '{value}'.") | ||||
| 
 | ||||
| 
 | ||||
| RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| import warnings | ||||
| 
 | ||||
| from .errors import Warnings | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,3 +1,5 @@ | |||
| from .candidate import Candidate, InMemoryCandidate | ||||
| from .kb import KnowledgeBase | ||||
| from .kb_in_memory import InMemoryLookupKB | ||||
| from .candidate import Candidate, get_candidates, get_candidates_batch | ||||
| 
 | ||||
| __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] | ||||
|  |  | |||
|  | @ -1,12 +1,17 @@ | |||
| from .kb cimport KnowledgeBase | ||||
| from libcpp.vector cimport vector | ||||
| from ..typedefs cimport hash_t | ||||
| 
 | ||||
| # Object used by the Entity Linker that summarizes one entity-alias candidate combination. | ||||
| from ..typedefs cimport hash_t | ||||
| from .kb_in_memory cimport InMemoryLookupKB | ||||
| 
 | ||||
| 
 | ||||
| cdef class Candidate: | ||||
|     cdef readonly KnowledgeBase kb | ||||
|     cdef hash_t entity_hash | ||||
|     cdef float entity_freq | ||||
|     cdef vector[float] entity_vector | ||||
|     cdef hash_t alias_hash | ||||
|     cdef float prior_prob | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| cdef class InMemoryCandidate(Candidate): | ||||
|     cdef readonly hash_t _entity_hash | ||||
|     cdef readonly hash_t _alias_hash | ||||
|     cdef vector[float] _entity_vector | ||||
|     cdef float _prior_prob | ||||
|     cdef readonly InMemoryLookupKB _kb | ||||
|     cdef float _entity_freq | ||||
|  |  | |||
|  | @ -1,74 +1,98 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| 
 | ||||
| from typing import Iterable | ||||
| from .kb cimport KnowledgeBase | ||||
| from ..tokens import Span | ||||
| from .kb_in_memory cimport InMemoryLookupKB | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| cdef class Candidate: | ||||
|     """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved | ||||
|     to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking | ||||
|     """A `Candidate` object refers to a textual mention that may or may not be resolved | ||||
|     to a specific entity from a Knowledge Base. This will be used as input for the entity linking | ||||
|     algorithm which will disambiguate the various candidates to the correct one. | ||||
|     Each candidate (alias, entity) pair is assigned a certain prior probability. | ||||
|     Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base, | ||||
|     is assigned a certain prior probability. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/kb/#candidate-init | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): | ||||
|         self.kb = kb | ||||
|         self.entity_hash = entity_hash | ||||
|         self.entity_freq = entity_freq | ||||
|         self.entity_vector = entity_vector | ||||
|         self.alias_hash = alias_hash | ||||
|         self.prior_prob = prior_prob | ||||
|     def __init__(self): | ||||
|         # Make sure abstract Candidate is not instantiated. | ||||
|         if self.__class__ == Candidate: | ||||
|             raise TypeError( | ||||
|                 Errors.E1046.format(cls_name=self.__class__.__name__) | ||||
|             ) | ||||
| 
 | ||||
|     @property | ||||
|     def entity(self) -> int: | ||||
|         """RETURNS (uint64): hash of the entity's KB ID/name""" | ||||
|         return self.entity_hash | ||||
|     def entity_id(self) -> int: | ||||
|         """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID, | ||||
|         otherwise the hash of the entity ID string).""" | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     @property | ||||
|     def entity_(self) -> str: | ||||
|         """RETURNS (str): ID/name of this entity in the KB""" | ||||
|         return self.kb.vocab.strings[self.entity_hash] | ||||
|     def entity_id_(self) -> str: | ||||
|         """RETURNS (str): String representation of entity ID.""" | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     @property | ||||
|     def alias(self) -> int: | ||||
|         """RETURNS (uint64): hash of the alias""" | ||||
|         return self.alias_hash | ||||
|     def entity_vector(self) -> vector[float]: | ||||
|         """RETURNS (vector[float]): Entity vector.""" | ||||
|         raise NotImplementedError | ||||
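Since the base class now guards its constructor, instantiating it directly fails. A tiny sketch:

    from spacy.kb import Candidate

    try:
        Candidate()  # abstract base: __init__ rejects direct instantiation
    except TypeError as err:
        print(err)   # E1046-style message naming the class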
| 
 | ||||
| 
 | ||||
| cdef class InMemoryCandidate(Candidate): | ||||
|     """Candidate for InMemoryLookupKB.""" | ||||
| 
 | ||||
|     def __init__( | ||||
|         self, | ||||
|         kb: InMemoryLookupKB, | ||||
|         entity_hash: int, | ||||
|         alias_hash: int, | ||||
|         entity_vector: vector[float], | ||||
|         prior_prob: float, | ||||
|         entity_freq: float | ||||
|     ): | ||||
|         """ | ||||
        kb (InMemoryLookupKB): InMemoryLookupKB instance. | ||||
        entity_hash (int): Entity ID as hash that can be looked up with InMemoryLookupKB.vocab.strings.__getitem__(). | ||||
|         entity_freq (int): Entity frequency in KB corpus. | ||||
|         entity_vector (List[float]): Entity embedding. | ||||
|         alias_hash (int): Alias hash. | ||||
        prior_prob (float): Prior probability of entity for this alias, i.e. the probability that, independent of | ||||
            the context, this alias - which matches one of this entity's aliases - resolves to this entity. | ||||
|         """ | ||||
|         super().__init__() | ||||
| 
 | ||||
|         self._entity_hash = entity_hash | ||||
|         self._entity_vector = entity_vector | ||||
|         self._prior_prob = prior_prob | ||||
|         self._kb = kb | ||||
|         self._alias_hash = alias_hash | ||||
|         self._entity_freq = entity_freq | ||||
| 
 | ||||
|     @property | ||||
|     def alias_(self) -> str: | ||||
|         """RETURNS (str): ID of the original alias""" | ||||
|         return self.kb.vocab.strings[self.alias_hash] | ||||
|     def entity_id(self) -> int: | ||||
|         return self._entity_hash | ||||
| 
 | ||||
|     @property | ||||
|     def entity_freq(self) -> float: | ||||
|         return self.entity_freq | ||||
| 
 | ||||
|     @property | ||||
|     def entity_vector(self) -> Iterable[float]: | ||||
|         return self.entity_vector | ||||
|     def entity_vector(self) -> vector[float]: | ||||
|         return self._entity_vector | ||||
| 
 | ||||
|     @property | ||||
|     def prior_prob(self) -> float: | ||||
|         return self.prior_prob | ||||
|         """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to | ||||
|         this entity.""" | ||||
|         return self._prior_prob | ||||
| 
 | ||||
|     @property | ||||
|     def alias(self) -> str: | ||||
|         """RETURNS (str): Alias.""" | ||||
|         return self._kb.vocab.strings[self._alias_hash] | ||||
| 
 | ||||
| def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: | ||||
|     """ | ||||
    Return candidate entities for a given mention by fetching appropriate entries from the index. | ||||
|     kb (KnowledgeBase): Knowledge base to query. | ||||
|     mention (Span): Entity mention for which to identify candidates. | ||||
|     RETURNS (Iterable[Candidate]): Identified candidates. | ||||
|     """ | ||||
|     return kb.get_candidates(mention) | ||||
|     @property | ||||
|     def entity_id_(self) -> str: | ||||
|         return self._kb.vocab.strings[self._entity_hash] | ||||
| 
 | ||||
| 
 | ||||
| def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: | ||||
|     """ | ||||
    Return candidate entities for the given mentions by fetching appropriate entries from the index. | ||||
    kb (KnowledgeBase): Knowledge base to query. | ||||
    mentions (Iterable[Span]): Entity mentions for which to identify candidates. | ||||
|     RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. | ||||
|     """ | ||||
|     return kb.get_candidates_batch(mentions) | ||||
|     @property | ||||
|     def entity_freq(self) -> float: | ||||
|         """RETURNS (float): Entity frequency in KB corpus.""" | ||||
|         return self._entity_freq | ||||
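Putting the renamed pieces together, a hedged end-to-end sketch with toy data (entity_id_ replaces the old entity_, and get_candidates now takes a Span):

    import spacy
    from spacy.kb import InMemoryLookupKB

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 0.0, 0.0])
    kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.8])

    mention = nlp("Douglas")[0:1]  # a Span, per the new signature
    for cand in kb.get_candidates(mention):
        print(cand.entity_id_, cand.alias, cand.prior_prob, cand.entity_freq)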
|  |  | |||
|  | @ -2,8 +2,10 @@ | |||
| 
 | ||||
| from cymem.cymem cimport Pool | ||||
| from libc.stdint cimport int64_t | ||||
| 
 | ||||
| from ..vocab cimport Vocab | ||||
| 
 | ||||
| 
 | ||||
| cdef class KnowledgeBase: | ||||
|     cdef Pool mem | ||||
|     cdef readonly Vocab vocab | ||||
|  |  | |||
|  | @ -2,12 +2,13 @@ | |||
| 
 | ||||
| from pathlib import Path | ||||
| from typing import Iterable, Tuple, Union | ||||
| 
 | ||||
| from cymem.cymem cimport Pool | ||||
| 
 | ||||
| from .candidate import Candidate | ||||
| from ..tokens import Span | ||||
| from ..util import SimpleFrozenList | ||||
| from ..errors import Errors | ||||
| from ..tokens import Span, SpanGroup | ||||
| from ..util import SimpleFrozenList | ||||
| from .candidate import Candidate | ||||
| 
 | ||||
| 
 | ||||
| cdef class KnowledgeBase: | ||||
|  | @ -30,21 +31,23 @@ cdef class KnowledgeBase: | |||
|         self.entity_vector_length = entity_vector_length | ||||
|         self.mem = Pool() | ||||
| 
 | ||||
|     def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: | ||||
|     def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]: | ||||
|         """ | ||||
|         Return candidate entities for specified texts. Each candidate defines the entity, the original alias, | ||||
|         and the prior probability of that alias resolving to that entity. | ||||
|         If no candidate is found for a given text, an empty list is returned. | ||||
|         mentions (Iterable[Span]): Mentions for which to get candidates. | ||||
        Return candidate entities for each specified Span mention. Each candidate defines at least the entity and the | ||||
|         entity's embedding vector. Depending on the KB implementation, further properties - such as the prior | ||||
|         probability of the specified mention text resolving to that entity - might be included. | ||||
|         If no candidates are found for a given mention, an empty list is returned. | ||||
|         mentions (SpanGroup): Mentions for which to get candidates. | ||||
|         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. | ||||
|         """ | ||||
|         return [self.get_candidates(span) for span in mentions] | ||||
| 
 | ||||
|     def get_candidates(self, mention: Span) -> Iterable[Candidate]: | ||||
|         """ | ||||
|         Return candidate entities for specified text. Each candidate defines the entity, the original alias, | ||||
|         and the prior probability of that alias resolving to that entity. | ||||
        If no candidate is found for a given text, an empty list is returned. | ||||
|         Return candidate entities for a specific mention. Each candidate defines at least the entity and the | ||||
|         entity's embedding vector. Depending on the KB implementation, further properties - such as the prior | ||||
|         probability of the specified mention text resolving to that entity - might be included. | ||||
|         If no candidate is found for the given mention, an empty list is returned. | ||||
|         mention (Span): Mention for which to get candidates. | ||||
|         RETURNS (Iterable[Candidate]): Identified candidates. | ||||
|         """ | ||||
|  | @ -106,3 +109,10 @@ cdef class KnowledgeBase: | |||
|         raise NotImplementedError( | ||||
|             Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) | ||||
|         ) | ||||
| 
 | ||||
|     @property | ||||
|     def supports_prior_probs(self) -> bool: | ||||
|         """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions.""" | ||||
|         raise NotImplementedError( | ||||
|             Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__) | ||||
|         ) | ||||
|  |  | |||
|  | @ -1,11 +1,11 @@ | |||
| """Knowledge-base for entity or concept linking.""" | ||||
| from preshed.maps cimport PreshMap | ||||
| from libcpp.vector cimport vector | ||||
| from libc.stdint cimport int32_t, int64_t | ||||
| from libc.stdio cimport FILE | ||||
| from libcpp.vector cimport vector | ||||
| from preshed.maps cimport PreshMap | ||||
| 
 | ||||
| from ..structs cimport AliasC, KBEntryC | ||||
| from ..typedefs cimport hash_t | ||||
| from ..structs cimport KBEntryC, AliasC | ||||
| from .kb cimport KnowledgeBase | ||||
| 
 | ||||
| ctypedef vector[KBEntryC] entry_vec | ||||
|  |  | |||
|  | @ -1,24 +1,29 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| from typing import Iterable, Callable, Dict, Any, Union | ||||
| from typing import Any, Callable, Dict, Iterable, Union | ||||
| 
 | ||||
| import srsly | ||||
| from preshed.maps cimport PreshMap | ||||
| from cpython.exc cimport PyErr_SetFromErrno | ||||
| from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek | ||||
| from libc.stdint cimport int32_t, int64_t | ||||
| from libcpp.vector cimport vector | ||||
| 
 | ||||
| from pathlib import Path | ||||
| from cpython.exc cimport PyErr_SetFromErrno | ||||
| from libc.stdint cimport int32_t, int64_t | ||||
| from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite | ||||
| from libcpp.vector cimport vector | ||||
| from preshed.maps cimport PreshMap | ||||
| 
 | ||||
| import warnings | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ..tokens import Span | ||||
| 
 | ||||
| from ..typedefs cimport hash_t | ||||
| from ..errors import Errors, Warnings | ||||
| 
 | ||||
| from .. import util | ||||
| from ..errors import Errors, Warnings | ||||
| from ..util import SimpleFrozenList, ensure_path | ||||
| 
 | ||||
| from ..vocab cimport Vocab | ||||
| from .kb cimport KnowledgeBase | ||||
| from .candidate import Candidate as Candidate | ||||
| 
 | ||||
| from .candidate import InMemoryCandidate | ||||
| 
 | ||||
| 
 | ||||
| cdef class InMemoryLookupKB(KnowledgeBase): | ||||
|  | @ -226,10 +231,10 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|             alias_entry.probs = probs | ||||
|             self._aliases_table[alias_index] = alias_entry | ||||
| 
 | ||||
|     def get_candidates(self, mention: Span) -> Iterable[Candidate]: | ||||
|         return self.get_alias_candidates(mention.text)  # type: ignore | ||||
|     def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]: | ||||
|         return self._get_alias_candidates(mention.text)  # type: ignore | ||||
| 
 | ||||
|     def get_alias_candidates(self, str alias) -> Iterable[Candidate]: | ||||
|     def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]: | ||||
|         """ | ||||
|         Return candidate entities for an alias. Each candidate defines the entity, the original alias, | ||||
|         and the prior probability of that alias resolving to that entity. | ||||
|  | @ -241,14 +246,18 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|         alias_index = <int64_t>self._alias_index.get(alias_hash) | ||||
|         alias_entry = self._aliases_table[alias_index] | ||||
| 
 | ||||
|         return [Candidate(kb=self, | ||||
|         return [ | ||||
|             InMemoryCandidate( | ||||
|                 kb=self, | ||||
|                 entity_hash=self._entries[entry_index].entity_hash, | ||||
|                           entity_freq=self._entries[entry_index].freq, | ||||
|                           entity_vector=self._vectors_table[self._entries[entry_index].vector_index], | ||||
|                 alias_hash=alias_hash, | ||||
|                           prior_prob=prior_prob) | ||||
|                 entity_vector=self._vectors_table[self._entries[entry_index].vector_index], | ||||
|                 prior_prob=prior_prob, | ||||
|                 entity_freq=self._entries[entry_index].freq | ||||
|             ) | ||||
|             for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) | ||||
|                 if entry_index != 0] | ||||
|             if entry_index != 0 | ||||
|         ] | ||||
| 
 | ||||
|     def get_vector(self, str entity): | ||||
|         cdef hash_t entity_hash = self.vocab.strings[entity] | ||||
|  | @ -279,6 +288,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
| 
 | ||||
|         return 0.0 | ||||
| 
 | ||||
    @property | ||||
    def supports_prior_probs(self) -> bool: | ||||
|         return True | ||||
| 
 | ||||
|     def to_bytes(self, **kwargs): | ||||
|         """Serialize the current state to a binary string. | ||||
|         """ | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from ...language import BaseDefaults, Language | ||||
| from .stop_words import STOP_WORDS | ||||
| from ...language import Language, BaseDefaults | ||||
| 
 | ||||
| 
 | ||||
| class AfrikaansDefaults(BaseDefaults): | ||||
|  |  | |||
|  | @ -1,12 +1,11 @@ | |||
| from .stop_words import STOP_WORDS | ||||
| from ...attrs import LANG | ||||
| from ...language import BaseDefaults, Language | ||||
| from ...util import update_exc | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from .lex_attrs import LEX_ATTRS | ||||
| from .punctuation import TOKENIZER_SUFFIXES | ||||
| 
 | ||||
| from .stop_words import STOP_WORDS | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| from ...language import Language, BaseDefaults | ||||
| from ...attrs import LANG | ||||
| from ...util import update_exc | ||||
| 
 | ||||
| 
 | ||||
| class AmharicDefaults(BaseDefaults): | ||||
|  |  | |||
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()

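Suffix lists like _list_punct above are ultimately compiled into a single regex that the tokenizer tries against the end of each token candidate. A sketch using spaCy's compile_suffix_regex; the sample suffix list is invented:

from spacy.util import compile_suffix_regex

suffixes = ["!", r"\.", "።"]  # includes the Ethiopic full stop from _list_punct
suffix_re = compile_suffix_regex(suffixes)
print(suffix_re.search("ሰላም።"))  # matches the trailing "።"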
@@ -1,5 +1,4 @@
-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH

 _exc = {}

@@ -1,8 +1,8 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from ...language import Language, BaseDefaults


 class ArabicDefaults(BaseDefaults):
@@ -1,5 +1,11 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
-from ..char_classes import UNITS, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA_UPPER,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _suffixes = (
     LIST_PUNCT
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS


 class AzerbaijaniDefaults(BaseDefaults):
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 # Eleven, twelve etc. are written separate: on bir, on iki

 _num_words = [
@@ -1,12 +1,14 @@
+from ...attrs import LANG
+from ...language import BaseDefaults, Language
+from ...util import update_exc
+from ..punctuation import (
+    COMBINING_DIACRITICS_TOKENIZER_INFIXES,
+    COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
+)
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .lex_attrs import LEX_ATTRS
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
-from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
-from ...attrs import LANG
-from ...util import update_exc


 class BulgarianDefaults(BaseDefaults):
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 _num_words = [
     "нула",
     "едно",
@@ -4,8 +4,7 @@ References:
     (countries, occupations, fields of studies and more).
 """

-from ...symbols import ORTH, NORM
-
+from ...symbols import NORM, ORTH

 _exc = {}

@@ -1,10 +1,12 @@
-from typing import Optional, Callable
+from typing import Callable, Optional
+
 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults

+from ...language import BaseDefaults, Language
+from ...pipeline import Lemmatizer
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class BengaliDefaults(BaseDefaults):
@@ -1,6 +1,14 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
-
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    UNITS,
+)

 _currency = r"\$¢£€¥฿৳"
 _quotes = CONCAT_QUOTES.replace("'", "")
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

@@ -1,14 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from ...language import BaseDefaults, Language
 from .lemmatizer import CatalanLemmatizer
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class CatalanDefaults(BaseDefaults):
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 _num_words = [
     "zero",
     "un",
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
-from ..char_classes import LIST_CURRENCY
-from ..char_classes import CURRENCY
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
-from ..char_classes import merge_chars, _units
-
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    _units,
+    merge_chars,
+)

 ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")

@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import NOUN, PROPN
+from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
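The syntax_iterators modules reordered in this diff define the noun_chunks iterator that backs Doc.noun_chunks once a parser has run. A sketch, assuming the pretrained Catalan package ca_core_news_sm is installed:

import spacy

nlp = spacy.load("ca_core_news_sm")  # assumes this model package is installed
doc = nlp("El gat negre dorm al sofà.")
for chunk in doc.noun_chunks:  # served by the registered noun_chunks iterator
    print(chunk.text, chunk.root.dep_)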
@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

@@ -1,6 +1,6 @@
-from .stop_words import STOP_WORDS
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS


 class CzechDefaults(BaseDefaults):
@@ -1,9 +1,9 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import BaseDefaults, Language
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class DanishDefaults(BaseDefaults):
@@ -1,6 +1,5 @@
 from ...attrs import LIKE_NUM

-
 # Source http://fjern-uv.dk/tal.php
 _num_words = """nul
 en et to tre fire fem seks syv otte ni ti
@@ -1,8 +1,13 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)
 from ..punctuation import TOKENIZER_SUFFIXES

-
 _quotes = CONCAT_QUOTES.replace("'", "")

 _infixes = (
@@ -1,7 +1,8 @@
-from typing import Union, Iterator, Tuple
-from ...tokens import Doc, Span
-from ...symbols import NOUN, PROPN, PRON, VERB, AUX
+from typing import Iterator, Tuple, Union
+
 from ...errors import Errors
+from ...symbols import AUX, NOUN, PRON, PROPN, VERB
+from ...tokens import Doc, Span


 def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
@@ -2,10 +2,9 @@
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

@@ -1,8 +1,8 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
-from ...language import Language, BaseDefaults
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


 class GermanDefaults(BaseDefaults):
@@ -1,9 +1,18 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
-from ..char_classes import CURRENCY, UNITS, PUNCT
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    UNITS,
+)
 from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES

-
 _prefixes = ["``"] + BASE_TOKENIZER_PREFIXES

 _suffixes = (
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span

@@ -1,7 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
-
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {
     "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
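The German hunk above also shows the shape of a tokenizer exception: one dict per resulting token, with ORTH fixing the split and NORM the normalized form. A quick sketch of its effect on a blank pipeline:

import spacy

nlp = spacy.blank("de")
doc = nlp("Wir sitzen auf'm Sofa.")
print([t.text for t in doc])  # ['Wir', 'sitzen', 'auf', "'m", 'Sofa', '.']
print(doc[3].norm_)           # 'dem', via the NORM attribute above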
@@ -1,6 +1,6 @@
+from ...language import BaseDefaults, Language
 from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
-from ...language import Language, BaseDefaults


 class LowerSorbianDefaults(BaseDefaults):
@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
 from .lemmatizer import GreekLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+

 class GreekDefaults(BaseDefaults):
@@ -1,5 +1,6 @@
 def get_pos_from_wiktionary():
     import re
+
     from gensim.corpora.wikicorpus import extract_pages

     regex = re.compile(r"==={{(\w+)\|el}}===")
@@ -1,6 +1,16 @@
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
-from ..char_classes import CONCAT_QUOTES, CURRENCY
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    HYPHENS,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+)

 _units = (
     "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span

@@ -1,6 +1,6 @@
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
+from ...symbols import NORM, ORTH
 from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc = {}

@@ -1,13 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model

-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
-from .syntax_iterators import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_INFIXES
+from ...language import BaseDefaults, Language
 from .lemmatizer import EnglishLemmatizer
-from ...language import Language, BaseDefaults
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+

 class EnglishDefaults(BaseDefaults):
@@ -1,5 +1,12 @@
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
-from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    HYPHENS,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+)

 _infixes = (
     LIST_ELLIPSES
@@ -1,7 +1,7 @@
-from typing import Union, Iterator, Tuple
+from typing import Iterator, Tuple, Union

-from ...symbols import NOUN, PROPN, PRON
 from ...errors import Errors
+from ...symbols import NOUN, PRON, PROPN
 from ...tokens import Doc, Span

@@ -1,8 +1,8 @@
 from typing import Dict, List
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-from ...symbols import ORTH, NORM
-from ...util import update_exc

+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS

 _exc: Dict[str, List[Dict]] = {}
 _exclude = [
@@ -1,12 +1,14 @@
-from typing import Optional, Callable
+from typing import Callable, Optional

 from thinc.api import Model
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-from .stop_words import STOP_WORDS
-from .lex_attrs import LEX_ATTRS
+
+from ...language import BaseDefaults, Language
 from .lemmatizer import SpanishLemmatizer
-from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
-from ...language import Language, BaseDefaults
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+

 class SpanishDefaults(BaseDefaults):
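The per-language Defaults classes edited throughout this diff are what spacy.blank() assembles for a given language code; a quick sketch:

import spacy

nlp = spacy.blank("es")       # built from SpanishDefaults
print(nlp.Defaults.__name__)  # SpanishDefaults
print([t.text for t in nlp("¿Qué tal?")])  # ['¿', 'Qué', 'tal', '?']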