Mirror of https://github.com/explosion/spaCy.git
Synced 2025-11-04 09:57:26 +03:00

Merge pull request #13046 from explosion/docs/llm_main
Sync `docs/llm_develop` with `docs/llm_main`

Commit df07c4734b

.github/workflows/tests.yml (vendored, 6 changes)

@@ -58,10 +58,8 @@ jobs:
       fail-fast: true
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python_version: ["3.11"]
+        python_version: ["3.11", "3.12.0-rc.2"]
         include:
-          - os: ubuntu-20.04
-            python_version: "3.6"
           - os: windows-latest
             python_version: "3.7"
           - os: macos-latest

@@ -95,7 +93,7 @@ jobs:
       - name: Run mypy
         run: |
           python -m mypy spacy
-        if: matrix.python_version != '3.6'
+        if: matrix.python_version != '3.7'
 
       - name: Delete source directory and .egg-info
         run: |

README.md

@@ -16,7 +16,7 @@ model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the
 [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 
-💫 **Version 3.6 out now!**
+💫 **Version 3.7 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [](https://github.com/explosion/spaCy/actions/workflows/tests.yml)

@@ -108,7 +108,7 @@ For detailed installation instructions, see the
 
 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 3.6+ (only 64 bit)
+- **Python version**: Python 3.7+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)
 
 [pip]: https://pypi.org/project/spacy/

build-constraints.txt

@@ -1,9 +1,6 @@
-# build version constraints for use with wheelwright + multibuild
-numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
+# build version constraints for use with wheelwright
+numpy==1.15.0; python_version=='3.7' and platform_machine!='aarch64'
+numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'
 numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
 numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
-numpy==1.19.3; python_version=='3.9'
-numpy==1.21.3; python_version=='3.10'
-numpy==1.23.2; python_version=='3.11'
-numpy; python_version>='3.12'
+numpy>=1.25.0; python_version>='3.9'
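
The `python_version` / `platform_machine` expressions in these pins are standard PEP 508 environment markers, evaluated per build environment to select the matching numpy constraint. A minimal sketch of how such a marker evaluates, assuming the third-party `packaging` library (not part of this diff):

```
# Sketch: evaluating a PEP 508 marker like the pins above (assumes `packaging`).
from packaging.markers import Marker

marker = Marker("python_version == '3.8' and platform_machine != 'aarch64'")

# Against the current interpreter/platform:
print(marker.evaluate())

# Against an explicit environment, e.g. to check which numpy pin would apply:
print(marker.evaluate({"python_version": "3.8", "platform_machine": "x86_64"}))
```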

extra/DEVELOPER_DOCS/Listeners.md

@@ -1,14 +1,17 @@
 # Listeners
 
-1. [Overview](#1-overview)
-2. [Initialization](#2-initialization)
-   - [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
-   - [B. Shape inference](#2b-shape-inference)
-3. [Internal communication](#3-internal-communication)
-   - [A. During prediction](#3a-during-prediction)
-   - [B. During training](#3b-during-training)
-   - [C. Frozen components](#3c-frozen-components)
-4. [Replacing listener with standalone](#4-replacing-listener-with-standalone)
+- [1. Overview](#1-overview)
+- [2. Initialization](#2-initialization)
+  - [2A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
+  - [2B. Shape inference](#2b-shape-inference)
+- [3. Internal communication](#3-internal-communication)
+  - [3A. During prediction](#3a-during-prediction)
+  - [3B. During training](#3b-during-training)
+    - [Training with multiple listeners](#training-with-multiple-listeners)
+  - [3C. Frozen components](#3c-frozen-components)
+    - [The Tok2Vec or Transformer is frozen](#the-tok2vec-or-transformer-is-frozen)
+    - [The upstream component is frozen](#the-upstream-component-is-frozen)
+- [4. Replacing listener with standalone](#4-replacing-listener-with-standalone)
 
 ## 1. Overview
 

@@ -218,3 +221,15 @@ new_model = tok2vec_model.attrs["replace_listener"](new_model)
 The new config and model are then properly stored on the `nlp` object.
 Note that this functionality (running the replacement for a transformer listener) was broken prior to
 `spacy-transformers` 1.0.5.
+
+In spaCy 3.7, `Language.replace_listeners` was updated to pass the following additional arguments to the `replace_listener` callback:
+the listener to be replaced and the `tok2vec`/`transformer` pipe from which the new model was copied. To maintain backwards-compatibility,
+the method only passes these extra arguments for callbacks that support them:
+
+```
+def replace_listener_pre_37(copied_tok2vec_model):
+  ...
+
+def replace_listener_post_37(copied_tok2vec_model, replaced_listener, tok2vec_pipe):
+  ...
+```
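
Since the documentation added above says the extra arguments are only passed to callbacks that support them, the dispatch presumably inspects the callback's signature. A hypothetical sketch of that pattern (an assumed mechanism for illustration, not the actual spaCy implementation):

```
# Hypothetical sketch: pass the 3.7-style extra arguments only when the
# callback's signature can accept them (assumed mechanism, for illustration).
import inspect

def call_replace_listener(callback, copied_model, replaced_listener, tok2vec_pipe):
    n_params = len(inspect.signature(callback).parameters)
    if n_params >= 3:
        # Post-3.7 callback: also gets the replaced listener and the pipe.
        return callback(copied_model, replaced_listener, tok2vec_pipe)
    # Pre-3.7 callback: only gets the copied tok2vec/transformer model.
    return callback(copied_model)
```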

pyproject.toml

@@ -5,8 +5,9 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.1.8,<8.2.0",
-    "numpy>=1.15.0",
+    "thinc>=8.1.8,<8.3.0",
+    "numpy>=1.15.0; python_version < '3.9'",
+    "numpy>=1.25.0; python_version >= '3.9'",
 ]
 build-backend = "setuptools.build_meta"
 

requirements.txt

@@ -3,17 +3,18 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.1.8,<8.2.0
+thinc>=8.1.8,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
 typer>=0.3.0,<0.10.0
-pathy>=0.10.0
 smart-open>=5.2.1,<7.0.0
+weasel>=0.1.0,<0.4.0
 # Third party dependencies
-numpy>=1.15.0
+numpy>=1.15.0; python_version < "3.9"
+numpy>=1.19.0; python_version >= "3.9"
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0

@@ -31,12 +32,11 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
-types-dataclasses>=0.1.3; python_version < "3.7"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
-cython-lint>=0.15.0; python_version >= "3.7"
+cython-lint>=0.15.0
 isort>=5.0,<6.0

setup.cfg (21 changes)

@@ -17,7 +17,6 @@ classifiers =
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.6
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9

@@ -31,15 +30,18 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.6
+python_requires = >=3.7
+# NOTE: This section is superseded by pyproject.toml and will be removed in
+# spaCy v4
 setup_requires =
     cython>=0.25,<3.0
-    numpy>=1.15.0
+    numpy>=1.15.0; python_version < "3.9"
+    numpy>=1.19.0; python_version >= "3.9"
     # We also need our Cython packages here to compile against
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.1.8,<8.2.0
+    thinc>=8.1.8,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0

@@ -47,16 +49,17 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.1.8,<8.2.0
+    thinc>=8.1.8,<8.3.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
+    weasel>=0.1.0,<0.4.0
     # Third-party dependencies
     typer>=0.3.0,<0.10.0
-    pathy>=0.10.0
     smart-open>=5.2.1,<7.0.0
     tqdm>=4.38.0,<5.0.0
-    numpy>=1.15.0
+    numpy>=1.15.0; python_version < "3.9"
+    numpy>=1.19.0; python_version >= "3.9"
     requests>=2.13.0,<3.0.0
     pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
     jinja2

@@ -74,9 +77,7 @@ console_scripts =
 lookups =
     spacy_lookups_data>=1.0.3,<1.1.0
 transformers =
-    spacy_transformers>=1.1.2,<1.3.0
-ray =
-    spacy_ray>=0.1.0,<1.0.0
+    spacy_transformers>=1.1.2,<1.4.0
 cuda =
     cupy>=5.0.0b4,<13.0.0
 cuda80 =

setup.py (1 change)

@@ -78,6 +78,7 @@ COMPILER_DIRECTIVES = {
     "language_level": -3,
     "embedsignature": True,
     "annotation_typing": False,
+    "profile": sys.version_info < (3, 12),
 }
 # Files to copy into the package that are otherwise not included
 COPY_FILES = {

spacy/about.py

@@ -1,7 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.6.1"
+__version__ = "3.7.1"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__projects__ = "https://github.com/explosion/projects"
-__projects_branch__ = "v3"

spacy/attrs.pyx

@@ -1,3 +1,4 @@
+# cython: profile=False
 from .errors import Errors
 
 IOB_STRINGS = ("", "I", "O", "B")
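
Both this `# cython: profile=False` module header and the `"profile"` entry added to `COMPILER_DIRECTIVES` in setup.py above control Cython's profiling hooks, which did not work on Python 3.12 at the time of this change (hence the `sys.version_info` guard). A sketch of how global directive defaults are applied at build time, assuming `Cython.Build.cythonize` as used by setup.py (the module path is illustrative):

```
# Sketch: global Cython directive defaults; a per-file "# cython: profile=False"
# comment overrides them for that module.
import sys
from Cython.Build import cythonize

COMPILER_DIRECTIVES = {
    "language_level": -3,
    "embedsignature": True,
    "annotation_typing": False,
    # Profiling hooks are unavailable on Python 3.12, so enable them only below it.
    "profile": sys.version_info < (3, 12),
}

ext_modules = cythonize(
    ["spacy/attrs.pyx"],  # illustrative module path
    compiler_directives=COMPILER_DIRECTIVES,
)
```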

spacy/cli/__init__.py

@@ -22,13 +22,6 @@ from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .package import package  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .profile import profile  # noqa: F401
-from .project.assets import project_assets  # noqa: F401
-from .project.clone import project_clone  # noqa: F401
-from .project.document import project_document  # noqa: F401
-from .project.dvc import project_update_dvc  # noqa: F401
-from .project.pull import project_pull  # noqa: F401
-from .project.push import project_push  # noqa: F401
-from .project.run import project_run  # noqa: F401
 from .train import train_cli  # noqa: F401
 from .validate import validate  # noqa: F401
 

spacy/cli/_util.py

@@ -25,10 +25,11 @@ from thinc.api import Config, ConfigValidationError, require_gpu
 from thinc.util import gpu_is_available
 from typer.main import get_command
 from wasabi import Printer, msg
+from weasel import app as project_cli
 
 from .. import about
 from ..compat import Literal
-from ..schemas import ProjectConfigSchema, validate
+from ..schemas import validate
 from ..util import (
     ENV_VARS,
     SimpleFrozenDict,

@@ -40,15 +41,10 @@ from ..util import (
     run_command,
 )
 
-if TYPE_CHECKING:
-    from pathy import FluidPath  # noqa: F401
-
-
 SDIST_SUFFIX = ".tar.gz"
 WHEEL_SUFFIX = "-py3-none-any.whl"
 
 PROJECT_FILE = "project.yml"
-PROJECT_LOCK = "project.lock"
 COMMAND = "python -m spacy"
 NAME = "spacy"
 HELP = """spaCy Command-line Interface

@@ -74,11 +70,10 @@ Opt = typer.Option
 
 app = typer.Typer(name=NAME, help=HELP)
 benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
-project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
 debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
 init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
 
-app.add_typer(project_cli)
+app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True)
 app.add_typer(debug_cli)
 app.add_typer(benchmark_cli)
 app.add_typer(init_cli)

@@ -153,148 +148,6 @@ def _parse_override(value: Any) -> Any:
         return str(value)
 
 
-def load_project_config(
-    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
-) -> Dict[str, Any]:
-    """Load the project.yml file from a directory and validate it. Also make
-    sure that all directories defined in the config exist.
-
-    path (Path): The path to the project directory.
-    interpolate (bool): Whether to substitute project variables.
-    overrides (Dict[str, Any]): Optional config overrides.
-    RETURNS (Dict[str, Any]): The loaded project.yml.
-    """
-    config_path = path / PROJECT_FILE
-    if not config_path.exists():
-        msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
-    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
-    try:
-        config = srsly.read_yaml(config_path)
-    except ValueError as e:
-        msg.fail(invalid_err, e, exits=1)
-    errors = validate(ProjectConfigSchema, config)
-    if errors:
-        msg.fail(invalid_err)
-        print("\n".join(errors))
-        sys.exit(1)
-    validate_project_version(config)
-    validate_project_commands(config)
-    if interpolate:
-        err = f"{PROJECT_FILE} validation error"
-        with show_validation_error(title=err, hint_fill=False):
-            config = substitute_project_variables(config, overrides)
-    # Make sure directories defined in config exist
-    for subdir in config.get("directories", []):
-        dir_path = path / subdir
-        if not dir_path.exists():
-            dir_path.mkdir(parents=True)
-    return config
-
-
-def substitute_project_variables(
-    config: Dict[str, Any],
-    overrides: Dict[str, Any] = SimpleFrozenDict(),
-    key: str = "vars",
-    env_key: str = "env",
-) -> Dict[str, Any]:
-    """Interpolate variables in the project file using the config system.
-
-    config (Dict[str, Any]): The project config.
-    overrides (Dict[str, Any]): Optional config overrides.
-    key (str): Key containing variables in project config.
-    env_key (str): Key containing environment variable mapping in project config.
-    RETURNS (Dict[str, Any]): The interpolated project config.
-    """
-    config.setdefault(key, {})
-    config.setdefault(env_key, {})
-    # Substitute references to env vars with their values
-    for config_var, env_var in config[env_key].items():
-        config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
-    # Need to put variables in the top scope again so we can have a top-level
-    # section "project" (otherwise, a list of commands in the top scope wouldn't)
-    # be allowed by Thinc's config system
-    cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
-    cfg = Config().from_str(cfg.to_str(), overrides=overrides)
-    interpolated = cfg.interpolate()
-    return dict(interpolated["project"])
-
-
-def validate_project_version(config: Dict[str, Any]) -> None:
-    """If the project defines a compatible spaCy version range, chec that it's
-    compatible with the current version of spaCy.
-
-    config (Dict[str, Any]): The loaded config.
-    """
-    spacy_version = config.get("spacy_version", None)
-    if spacy_version and not is_compatible_version(about.__version__, spacy_version):
-        err = (
-            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
-            f"that's not compatible with the version of spaCy you're running "
-            f"({about.__version__}). You can edit version requirement in the "
-            f"{PROJECT_FILE} to load it, but the project may not run as expected."
-        )
-        msg.fail(err, exits=1)
-
-
-def validate_project_commands(config: Dict[str, Any]) -> None:
-    """Check that project commands and workflows are valid, don't contain
-    duplicates, don't clash  and only refer to commands that exist.
-
-    config (Dict[str, Any]): The loaded config.
-    """
-    command_names = [cmd["name"] for cmd in config.get("commands", [])]
-    workflows = config.get("workflows", {})
-    duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
-    if duplicates:
-        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
-        msg.fail(err, exits=1)
-    for workflow_name, workflow_steps in workflows.items():
-        if workflow_name in command_names:
-            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
-            msg.fail(err, exits=1)
-        for step in workflow_steps:
-            if step not in command_names:
-                msg.fail(
-                    f"Unknown command specified in workflow '{workflow_name}': {step}",
-                    f"Workflows can only refer to commands defined in the 'commands' "
-                    f"section of the {PROJECT_FILE}.",
-                    exits=1,
-                )
-
-
-def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
-    """Get the hash for a JSON-serializable object.
-
-    data: The data to hash.
-    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
-    RETURNS (str): The hash.
-    """
-    if isinstance(data, dict):
-        data = {k: v for k, v in data.items() if k not in exclude}
-    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
-    return hashlib.md5(data_str).hexdigest()
-
-
-def get_checksum(path: Union[Path, str]) -> str:
-    """Get the checksum for a file or directory given its file path. If a
-    directory path is provided, this uses all files in that directory.
-
-    path (Union[Path, str]): The file or directory path.
-    RETURNS (str): The checksum.
-    """
-    path = Path(path)
-    if not (path.is_file() or path.is_dir()):
-        msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
-    if path.is_file():
-        return hashlib.md5(Path(path).read_bytes()).hexdigest()
-    else:
-        # TODO: this is currently pretty slow
-        dir_checksum = hashlib.md5()
-        for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
-            dir_checksum.update(sub_file.read_bytes())
-        return dir_checksum.hexdigest()
-
-
 @contextmanager
 def show_validation_error(
     file_path: Optional[Union[str, Path]] = None,

@@ -352,166 +205,10 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
             msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
 
 
-def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
-    """Upload a file.
-
-    src (Path): The source path.
-    url (str): The destination URL to upload to.
-    """
-    import smart_open
-
-    # Create parent directories for local paths
-    if isinstance(dest, Path):
-        if not dest.parent.exists():
-            dest.parent.mkdir(parents=True)
-
-    dest = str(dest)
-    with smart_open.open(dest, mode="wb") as output_file:
-        with src.open(mode="rb") as input_file:
-            output_file.write(input_file.read())
-
-
-def download_file(
-    src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
-) -> None:
-    """Download a file using smart_open.
-
-    url (str): The URL of the file.
-    dest (Path): The destination path.
-    force (bool): Whether to force download even if file exists.
-        If False, the download will be skipped.
-    """
-    import smart_open
-
-    if dest.exists() and not force:
-        return None
-    src = str(src)
-    with smart_open.open(src, mode="rb", compression="disable") as input_file:
-        with dest.open(mode="wb") as output_file:
-            shutil.copyfileobj(input_file, output_file)
-
-
-def ensure_pathy(path):
-    """Temporary helper to prevent importing Pathy globally (which can cause
-    slow and annoying Google Cloud warning)."""
-    from pathy import Pathy  # noqa: F811
-
-    return Pathy.fluid(path)
-
-
-def git_checkout(
-    repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
-):
-    git_version = get_git_version()
-    if dest.exists():
-        msg.fail("Destination of checkout must not exist", exits=1)
-    if not dest.parent.exists():
-        msg.fail("Parent of destination of checkout must exist", exits=1)
-    if sparse and git_version >= (2, 22):
-        return git_sparse_checkout(repo, subpath, dest, branch)
-    elif sparse:
-        # Only show warnings if the user explicitly wants sparse checkout but
-        # the Git version doesn't support it
-        err_old = (
-            f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
-            f"that doesn't fully support sparse checkout yet."
-        )
-        err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
-        msg.warn(
-            f"{err_unk if git_version == (0, 0) else err_old} "
-            f"This means that more files than necessary may be downloaded "
-            f"temporarily. To only download the files needed, make sure "
-            f"you're using Git v2.22 or above."
-        )
-    with make_tempdir() as tmp_dir:
-        cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
-        run_command(cmd, capture=True)
-        # We need Path(name) to make sure we also support subdirectories
-        try:
-            source_path = tmp_dir / Path(subpath)
-            if not is_subpath_of(tmp_dir, source_path):
-                err = f"'{subpath}' is a path outside of the cloned repository."
-                msg.fail(err, repo, exits=1)
-            shutil.copytree(str(source_path), str(dest))
-        except FileNotFoundError:
-            err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
-            msg.fail(err, repo, exits=1)
-
-
-def git_sparse_checkout(repo, subpath, dest, branch):
-    # We're using Git, partial clone and sparse checkout to
-    # only clone the files we need
-    # This ends up being RIDICULOUS. omg.
-    # So, every tutorial and SO post talks about 'sparse checkout'...But they
-    # go and *clone* the whole repo. Worthless. And cloning part of a repo
-    # turns out to be completely broken. The only way to specify a "path" is..
-    # a path *on the server*? The contents of which, specifies the paths. Wat.
-    # Obviously this is hopelessly broken and insecure, because you can query
-    # arbitrary paths on the server! So nobody enables this.
-    # What we have to do is disable *all* files. We could then just checkout
-    # the path, and it'd "work", but be hopelessly slow...Because it goes and
-    # transfers every missing object one-by-one. So the final piece is that we
-    # need to use some weird git internals to fetch the missings in bulk, and
-    # *that* we can do by path.
-    # We're using Git and sparse checkout to only clone the files we need
-    with make_tempdir() as tmp_dir:
-        # This is the "clone, but don't download anything" part.
-        cmd = (
-            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
-            f"-b {branch} --filter=blob:none"
-        )
-        run_command(cmd)
-        # Now we need to find the missing filenames for the subpath we want.
-        # Looking for this 'rev-list' command in the git --help? Hah.
-        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
-        ret = run_command(cmd, capture=True)
-        git_repo = _http_to_git(repo)
-        # Now pass those missings into another bit of git internals
-        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-        if not missings:
-            err = (
-                f"Could not find any relevant files for '{subpath}'. "
-                f"Did you specify a correct and complete path within repo '{repo}' "
-                f"and branch {branch}?"
-            )
-            msg.fail(err, exits=1)
-        cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
-        run_command(cmd, capture=True)
-        # And finally, we can checkout our subpath
-        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        run_command(cmd, capture=True)
-
-        # Get a subdirectory of the cloned path, if appropriate
-        source_path = tmp_dir / Path(subpath)
-        if not is_subpath_of(tmp_dir, source_path):
-            err = f"'{subpath}' is a path outside of the cloned repository."
-            msg.fail(err, repo, exits=1)
-
-        shutil.move(str(source_path), str(dest))
-
-
-def git_repo_branch_exists(repo: str, branch: str) -> bool:
-    """Uses 'git ls-remote' to check if a repository and branch exists
-
-    repo (str): URL to get repo.
-    branch (str): Branch on repo to check.
-    RETURNS (bool): True if repo:branch exists.
-    """
-    get_git_version()
-    cmd = f"git ls-remote {repo} {branch}"
-    # We might be tempted to use `--exit-code` with `git ls-remote`, but
-    # `run_command` handles the `returncode` for us, so we'll rely on
-    # the fact that stdout returns '' if the requested branch doesn't exist
-    ret = run_command(cmd, capture=True)
-    exists = ret.stdout != ""
-    return exists
-
-
 def get_git_version(
     error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
 ) -> Tuple[int, int]:
     """Get the version of git and raise an error if calling 'git --version' fails.
-
     error (str): The error message to show.
     RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
         (0, 0) if the version couldn't be determined.

@@ -527,30 +224,6 @@ def get_git_version(
     return int(version[0]), int(version[1])
 
 
-def _http_to_git(repo: str) -> str:
-    if repo.startswith("http://"):
-        repo = repo.replace(r"http://", r"https://")
-    if repo.startswith(r"https://"):
-        repo = repo.replace("https://", "git@").replace("/", ":", 1)
-        if repo.endswith("/"):
-            repo = repo[:-1]
-        repo = f"{repo}.git"
-    return repo
-
-
-def is_subpath_of(parent, child):
-    """
-    Check whether `child` is a path contained within `parent`.
-    """
-    # Based on https://stackoverflow.com/a/37095733 .
-
-    # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
-    # we can stop using crusty old os.path functions.
-    parent_realpath = os.path.realpath(parent)
-    child_realpath = os.path.realpath(child)
-    return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
-
-
 @overload
 def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
     ...
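
The net effect of the `_util.py` changes above is that the `project` sub-app now comes from `weasel`, with its name and help text supplied at registration time via `app.add_typer(...)`. A minimal self-contained Typer sketch of that mounting pattern (the names here are illustrative stand-ins for weasel's exported app):

```
# Sketch: mounting an externally defined Typer sub-app under a parent CLI,
# as done above with `from weasel import app as project_cli`.
import typer

project_cli = typer.Typer()  # stand-in for weasel's exported app

@project_cli.command("run")
def run(workflow: str):
    typer.echo(f"running workflow: {workflow}")

app = typer.Typer(name="demo")
# Name and help are given at mount time, so the sub-app needs no parent-specific setup:
app.add_typer(project_cli, name="project", help="Project commands", no_args_is_help=True)

if __name__ == "__main__":
    app()  # e.g. `python demo.py project run train`
```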

spacy/cli/apply.py

@@ -133,7 +133,9 @@ def apply(
     if len(text_files) > 0:
         streams.append(_stream_texts(text_files))
     datagen = cast(DocOrStrStream, chain(*streams))
-    for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
+    for doc in tqdm.tqdm(
+        nlp.pipe(datagen, batch_size=batch_size, n_process=n_process), disable=None
+    ):
         docbin.add(doc)
     if output_file.suffix == "":
         output_file = output_file.with_suffix(".spacy")

spacy/cli/benchmark_speed.py

@@ -89,7 +89,7 @@ class Quartiles:
 def annotate(
     nlp: Language, docs: List[Doc], batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
+    docs = nlp.pipe(tqdm(docs, unit="doc", disable=None), batch_size=batch_size)
     wps = []
     while True:
         with time_context() as elapsed:

spacy/cli/evaluate.py

@@ -28,6 +28,7 @@ def evaluate_cli(
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
     per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
+    spans_key: str = Opt("sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans"),
     # fmt: on
 ):
     """

@@ -53,6 +54,7 @@ def evaluate_cli(
         displacy_limit=displacy_limit,
         per_component=per_component,
         silent=False,
+        spans_key=spans_key,
     )
 
 
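
The new `--spans-key` option selects which named group in `Doc.spans` gets scored, with `"sc"` as the default key. A small sketch of how spans live under such a key, assuming an installed spaCy and a blank English pipeline:

```
# Sketch: span annotations are stored in named groups on Doc.spans; the
# evaluate CLI's --spans-key picks which group ("sc" by default) to score.
import spacy

nlp = spacy.blank("en")
doc = nlp("Berlin is a city in Germany")

# Store two spans under the default key:
doc.spans["sc"] = [doc[0:1], doc[5:6]]
print([span.text for span in doc.spans["sc"]])  # ['Berlin', 'Germany']
```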

spacy/cli/profile.py

@@ -71,7 +71,7 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
 
 
 def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
-    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
+    for doc in nlp.pipe(tqdm.tqdm(texts, disable=None), batch_size=16):
         pass
 
 
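
The `disable=None` added to the tqdm calls in `apply.py`, `benchmark_speed.py`, and `profile.py` above uses tqdm's documented convention that `None` means "auto-disable on non-TTY output", which keeps progress bars out of CI logs and redirected files. A quick sketch:

```
# Sketch of tqdm's disable semantics: False always shows the bar, True always
# hides it, and None hides it automatically when output is not a terminal.
from tqdm import tqdm

for _ in tqdm(range(3), disable=None):
    pass  # bar appears only when stderr is an interactive TTY
```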

spacy/cli/project/assets.py (deleted)

@@ -1,217 +0,0 @@
-import os
-import re
-import shutil
-from pathlib import Path
-from typing import Any, Dict, Optional
-
-import requests
-import typer
-from wasabi import msg
-
-from ...util import ensure_path, working_dir
-from .._util import (
-    PROJECT_FILE,
-    Arg,
-    Opt,
-    SimpleFrozenDict,
-    download_file,
-    get_checksum,
-    get_git_version,
-    git_checkout,
-    load_project_config,
-    parse_config_overrides,
-    project_cli,
-)
-
-# Whether assets are extra if `extra` is not set.
-EXTRA_DEFAULT = False
-
-
-@project_cli.command(
-    "assets",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def project_assets_cli(
-    # fmt: off
-    ctx: typer.Context,  # This is only used to read additional arguments
-    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
-    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
-    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
-    # fmt: on
-):
-    """Fetch project assets like datasets and pretrained weights. Assets are
-    defined in the "assets" section of the project.yml. If a checksum is
-    provided in the project.yml, the file is only downloaded if no local file
-    with the same checksum exists.
-
-    DOCS: https://spacy.io/api/cli#project-assets
-    """
-    overrides = parse_config_overrides(ctx.args)
-    project_assets(
-        project_dir,
-        overrides=overrides,
-        sparse_checkout=sparse_checkout,
-        extra=extra,
-    )
-
-
-def project_assets(
-    project_dir: Path,
-    *,
-    overrides: Dict[str, Any] = SimpleFrozenDict(),
-    sparse_checkout: bool = False,
-    extra: bool = False,
-) -> None:
-    """Fetch assets for a project using DVC if possible.
-
-    project_dir (Path): Path to project directory.
-    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
-                            needed.
-    extra (bool): Whether to download all assets, including those marked as 'extra'.
-    """
-    project_path = ensure_path(project_dir)
-    config = load_project_config(project_path, overrides=overrides)
-    assets = [
-        asset
-        for asset in config.get("assets", [])
-        if extra or not asset.get("extra", EXTRA_DEFAULT)
-    ]
-    if not assets:
-        msg.warn(
-            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
-            exits=0,
-        )
-    msg.info(f"Fetching {len(assets)} asset(s)")
-
-    for asset in assets:
-        dest = (project_dir / asset["dest"]).resolve()
-        checksum = asset.get("checksum")
-        if "git" in asset:
-            git_err = (
-                f"Cloning spaCy project templates requires Git and the 'git' command. "
-                f"Make sure it's installed and that the executable is available."
-            )
-            get_git_version(error=git_err)
-            if dest.exists():
-                # If there's already a file, check for checksum
-                if checksum and checksum == get_checksum(dest):
-                    msg.good(
-                        f"Skipping download with matching checksum: {asset['dest']}"
-                    )
-                    continue
-                else:
-                    if dest.is_dir():
-                        shutil.rmtree(dest)
-                    else:
-                        dest.unlink()
-            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
-                msg.fail(
-                    "A git asset must include 'repo', the repository address.", exits=1
-                )
-            if "path" not in asset["git"] or asset["git"]["path"] is None:
-                msg.fail(
-                    "A git asset must include 'path' - use \"\" to get the entire repository.",
-                    exits=1,
-                )
-            git_checkout(
-                asset["git"]["repo"],
-                asset["git"]["path"],
-                dest,
-                branch=asset["git"].get("branch"),
-                sparse=sparse_checkout,
-            )
-            msg.good(f"Downloaded asset {dest}")
-        else:
-            url = asset.get("url")
-            if not url:
-                # project.yml defines asset without URL that the user has to place
-                check_private_asset(dest, checksum)
 | 
					 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
            fetch_asset(project_path, url, dest, checksum)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
 | 
					 | 
				
			||||||
    """Check and validate assets without a URL (private assets that the user
 | 
					 | 
				
			||||||
    has to provide themselves) and give feedback about the checksum.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    dest (Path): Destination path of the asset.
 | 
					 | 
				
			||||||
    checksum (Optional[str]): Optional checksum of the expected file.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    if not Path(dest).exists():
 | 
					 | 
				
			||||||
        err = f"No URL provided for asset. You need to add this file yourself: {dest}"
 | 
					 | 
				
			||||||
        msg.warn(err)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        if not checksum:
 | 
					 | 
				
			||||||
            msg.good(f"Asset already exists: {dest}")
 | 
					 | 
				
			||||||
        elif checksum == get_checksum(dest):
 | 
					 | 
				
			||||||
            msg.good(f"Asset exists with matching checksum: {dest}")
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            msg.fail(f"Asset available but with incorrect checksum: {dest}")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def fetch_asset(
 | 
					 | 
				
			||||||
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
 | 
					 | 
				
			||||||
) -> None:
 | 
					 | 
				
			||||||
    """Fetch an asset from a given URL or path. If a checksum is provided and a
 | 
					 | 
				
			||||||
    local file exists, it's only re-downloaded if the checksum doesn't match.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    project_path (Path): Path to project directory.
 | 
					 | 
				
			||||||
    url (str): URL or path to asset.
 | 
					 | 
				
			||||||
    checksum (Optional[str]): Optional expected checksum of local file.
 | 
					 | 
				
			||||||
    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
 | 
					 | 
				
			||||||
        the asset failed.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    dest_path = (project_path / dest).resolve()
 | 
					 | 
				
			||||||
    if dest_path.exists():
 | 
					 | 
				
			||||||
        # If there's already a file, check for checksum
 | 
					 | 
				
			||||||
        if checksum:
 | 
					 | 
				
			||||||
            if checksum == get_checksum(dest_path):
 | 
					 | 
				
			||||||
                msg.good(f"Skipping download with matching checksum: {dest}")
 | 
					 | 
				
			||||||
                return
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            # If there's not a checksum, make sure the file is a possibly valid size
 | 
					 | 
				
			||||||
            if os.path.getsize(dest_path) == 0:
 | 
					 | 
				
			||||||
                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
 | 
					 | 
				
			||||||
                os.remove(dest_path)
 | 
					 | 
				
			||||||
    # We might as well support the user here and create parent directories in
 | 
					 | 
				
			||||||
    # case the asset dir isn't listed as a dir to create in the project.yml
 | 
					 | 
				
			||||||
    if not dest_path.parent.exists():
 | 
					 | 
				
			||||||
        dest_path.parent.mkdir(parents=True)
 | 
					 | 
				
			||||||
    with working_dir(project_path):
 | 
					 | 
				
			||||||
        url = convert_asset_url(url)
 | 
					 | 
				
			||||||
        try:
 | 
					 | 
				
			||||||
            download_file(url, dest_path)
 | 
					 | 
				
			||||||
            msg.good(f"Downloaded asset {dest}")
 | 
					 | 
				
			||||||
        except requests.exceptions.RequestException as e:
 | 
					 | 
				
			||||||
            if Path(url).exists() and Path(url).is_file():
 | 
					 | 
				
			||||||
                # If it's a local file, copy to destination
 | 
					 | 
				
			||||||
                shutil.copy(url, str(dest_path))
 | 
					 | 
				
			||||||
                msg.good(f"Copied local asset {dest}")
 | 
					 | 
				
			||||||
            else:
 | 
					 | 
				
			||||||
                msg.fail(f"Download failed: {dest}", e)
 | 
					 | 
				
			||||||
    if checksum and checksum != get_checksum(dest_path):
 | 
					 | 
				
			||||||
        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def convert_asset_url(url: str) -> str:
 | 
					 | 
				
			||||||
    """Check and convert the asset URL if needed.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    url (str): The asset URL.
 | 
					 | 
				
			||||||
    RETURNS (str): The converted URL.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    # If the asset URL is a regular GitHub URL it's likely a mistake
 | 
					 | 
				
			||||||
    if (
 | 
					 | 
				
			||||||
        re.match(r"(http(s?)):\/\/github.com", url)
 | 
					 | 
				
			||||||
        and "releases/download" not in url
 | 
					 | 
				
			||||||
        and "/raw/" not in url
 | 
					 | 
				
			||||||
    ):
 | 
					 | 
				
			||||||
        converted = url.replace("github.com", "raw.githubusercontent.com")
 | 
					 | 
				
			||||||
        converted = re.sub(r"/(tree|blob)/", "/", converted)
 | 
					 | 
				
			||||||
        msg.warn(
 | 
					 | 
				
			||||||
            "Downloading from a regular GitHub URL. This will only download "
 | 
					 | 
				
			||||||
            "the source of the page, not the actual file. Converting the URL "
 | 
					 | 
				
			||||||
            "to a raw URL.",
 | 
					 | 
				
			||||||
            converted,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
        return converted
 | 
					 | 
				
			||||||
    return url
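
For illustration, a sketch of the two asset shapes the loop in project_assets() handles; the keys match what the code reads, but the values are placeholders:

url_asset = {
    "dest": "assets/training.json",  # where the asset ends up in the project
    "url": "https://example.com/data.json",  # downloaded via fetch_asset()
    "checksum": "<optional checksum>",  # skip download if a matching file exists
}
git_asset = {
    "dest": "assets/repo_data",
    "git": {  # checked out via git_checkout() instead of downloaded
        "repo": "https://github.com/example/data-repo",
        "path": "data",  # "" would fetch the entire repository
        "branch": "main",
    },
    "extra": True,  # skipped unless --extra is passed (EXTRA_DEFAULT is False)
}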
@ -1,124 +0,0 @@
import re
import subprocess
from pathlib import Path
from typing import Optional

from wasabi import msg

from ... import about
from ...util import ensure_path
from .._util import (
    COMMAND,
    PROJECT_FILE,
    Arg,
    Opt,
    get_git_version,
    git_checkout,
    git_repo_branch_exists,
    project_cli,
)

DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
DEFAULT_BRANCHES = ["main", "master"]


@project_cli.command("clone")
def project_clone_cli(
    # fmt: off
    name: str = Arg(..., help="The name of the template to clone"),
    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v2.22+.")
    # fmt: on
):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official spaCy template repo, but can be customized
    (including using a private repo).

    DOCS: https://spacy.io/api/cli#project-clone
    """
    if dest is None:
        dest = Path.cwd() / Path(name).parts[-1]
    if repo == DEFAULT_REPO and branch is None:
        branch = DEFAULT_PROJECTS_BRANCH

    if branch is None:
        for default_branch in DEFAULT_BRANCHES:
            if git_repo_branch_exists(repo, default_branch):
                branch = default_branch
                break
        if branch is None:
            default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
            msg.fail(
                "No branch provided and attempted default "
                f"branches {default_branches_msg} do not exist.",
                exits=1,
            )
    else:
        if not git_repo_branch_exists(repo, branch):
            msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
    assert isinstance(branch, str)
    project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)


def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    branch: str = about.__projects_branch__,
    sparse_checkout: bool = False,
) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    branch (str): The branch to clone from.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    try:
        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
    except subprocess.CalledProcessError:
        err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
        msg.fail(err, exits=1)
    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
    if not (project_dir / PROJECT_FILE).exists():
        msg.warn(f"No {PROJECT_FILE} found in directory")
    else:
        msg.good("Your project is now ready!")
        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")


def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    git_err = (
        f"Cloning spaCy project templates requires Git and the 'git' command. "
        f"To clone a project without Git, copy the files from the '{name}' "
        f"directory in {repo} to {dest} manually."
    )
    get_git_version(error=git_err)
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
            f"Create the necessary folder(s) first before continuing.",
            exits=1,
        )
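
A minimal usage sketch for the functions above; the import path is an assumption based on this file's location, and the template name is just an example:

from pathlib import Path

from spacy.cli.project.clone import project_clone  # assumed import path

# Clone one template subdirectory of the default repo into ./ner_demo.
# check_clone() requires that the destination directory does not exist yet.
project_clone("pipelines/ner_demo", Path.cwd() / "ner_demo")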
@ -1,115 +0,0 @@
from pathlib import Path

from wasabi import MarkdownRenderer, msg

from ...util import working_dir
from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli

DOCS_URL = "https://spacy.io"
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
INTRO_COMMANDS = f"""The following commands are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
Commands are only re-run if their inputs have changed."""
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed."""
INTRO_ASSETS = f"""The following assets are defined by the project. They can
be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
in the project directory."""
# These markers are added to the Markdown and can be used to update the file in
# place if it already exists. Only the auto-generated part will be replaced.
MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
# If this marker is used in an existing README, it's ignored and not replaced
MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"


@project_cli.command("document")
def project_document_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file. Defaults to - for standard output"),
    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
    # fmt: on
):
    """
    Auto-generate a README.md for a project. If the content is saved to a file,
    hidden markers are added so you can add custom content before or after the
    auto-generated section and only the auto-generated docs will be replaced
    when you re-run the command.

    DOCS: https://spacy.io/api/cli#project-document
    """
    project_document(project_dir, output_file, no_emoji=no_emoji)


def project_document(
    project_dir: Path, output_file: Path, *, no_emoji: bool = False
) -> None:
    is_stdout = str(output_file) == "-"
    config = load_project_config(project_dir)
    md = MarkdownRenderer(no_emoji=no_emoji)
    md.add(MARKER_START)
    title = config.get("title")
    description = config.get("description")
    md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
    if description:
        md.add(description)
    md.add(md.title(2, PROJECT_FILE, "📋"))
    md.add(INTRO_PROJECT)
    # Commands
    cmds = config.get("commands", [])
    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
    if data:
        md.add(md.title(3, "Commands", "⏯"))
        md.add(INTRO_COMMANDS)
        md.add(md.table(data, ["Command", "Description"]))
    # Workflows
    wfs = config.get("workflows", {}).items()
    data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs]
    if data:
        md.add(md.title(3, "Workflows", "⏭"))
        md.add(INTRO_WORKFLOWS)
        md.add(md.table(data, ["Workflow", "Steps"]))
    # Assets
    assets = config.get("assets", [])
    data = []
    for a in assets:
        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
        dest_path = a["dest"]
        dest = md.code(dest_path)
        if source == "Local":
            # Only link assets if they're in the repo
            with working_dir(project_dir) as p:
                if (p / dest_path).exists():
                    dest = md.link(dest, dest_path)
        data.append((dest, source, a.get("description", "")))
    if data:
        md.add(md.title(3, "Assets", "🗂"))
        md.add(INTRO_ASSETS)
        md.add(md.table(data, ["File", "Source", "Description"]))
    md.add(MARKER_END)
    # Output result
    if is_stdout:
        print(md.text)
    else:
        content = md.text
        if output_file.exists():
            with output_file.open("r", encoding="utf8") as f:
                existing = f.read()
            if MARKER_IGNORE in existing:
                msg.warn("Found ignore marker in existing file: skipping", output_file)
                return
            if MARKER_START in existing and MARKER_END in existing:
                msg.info("Found existing file: only replacing auto-generated docs")
                before = existing.split(MARKER_START)[0]
                after = existing.split(MARKER_END)[1]
                content = f"{before}{content}{after}"
            else:
                msg.warn("Replacing existing file")
        with output_file.open("w", encoding="utf8") as f:
            f.write(content)
        msg.good("Saved project documentation", output_file)
@ -1,220 +0,0 @@
"""This module contains helpers and subcommands for integrating spaCy projects
 | 
					 | 
				
			||||||
with Data Version Controk (DVC). https://dvc.org"""
 | 
					 | 
				
			||||||
import subprocess
 | 
					 | 
				
			||||||
from pathlib import Path
 | 
					 | 
				
			||||||
from typing import Any, Dict, Iterable, List, Optional
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from wasabi import msg
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from ...util import (
 | 
					 | 
				
			||||||
    SimpleFrozenList,
 | 
					 | 
				
			||||||
    join_command,
 | 
					 | 
				
			||||||
    run_command,
 | 
					 | 
				
			||||||
    split_command,
 | 
					 | 
				
			||||||
    working_dir,
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
from .._util import (
 | 
					 | 
				
			||||||
    COMMAND,
 | 
					 | 
				
			||||||
    NAME,
 | 
					 | 
				
			||||||
    PROJECT_FILE,
 | 
					 | 
				
			||||||
    Arg,
 | 
					 | 
				
			||||||
    Opt,
 | 
					 | 
				
			||||||
    get_hash,
 | 
					 | 
				
			||||||
    load_project_config,
 | 
					 | 
				
			||||||
    project_cli,
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
DVC_CONFIG = "dvc.yaml"
 | 
					 | 
				
			||||||
DVC_DIR = ".dvc"
 | 
					 | 
				
			||||||
UPDATE_COMMAND = "dvc"
 | 
					 | 
				
			||||||
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
 | 
					 | 
				
			||||||
# edited your {PROJECT_FILE}, you can regenerate this file by running:
 | 
					 | 
				
			||||||
# {COMMAND} project {UPDATE_COMMAND}"""
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@project_cli.command(UPDATE_COMMAND)
 | 
					 | 
				
			||||||
def project_update_dvc_cli(
 | 
					 | 
				
			||||||
    # fmt: off
 | 
					 | 
				
			||||||
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
 | 
					 | 
				
			||||||
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
 | 
					 | 
				
			||||||
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
 | 
					 | 
				
			||||||
    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
 | 
					 | 
				
			||||||
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
 | 
					 | 
				
			||||||
    # fmt: on
 | 
					 | 
				
			||||||
):
 | 
					 | 
				
			||||||
    """Auto-generate Data Version Control (DVC) config. A DVC
 | 
					 | 
				
			||||||
    project can only define one pipeline, so you need to specify one workflow
 | 
					 | 
				
			||||||
    defined in the project.yml. If no workflow is specified, the first defined
 | 
					 | 
				
			||||||
    workflow is used. The DVC config will only be updated if the project.yml
 | 
					 | 
				
			||||||
    changed.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    DOCS: https://spacy.io/api/cli#project-dvc
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def project_update_dvc(
 | 
					 | 
				
			||||||
    project_dir: Path,
 | 
					 | 
				
			||||||
    workflow: Optional[str] = None,
 | 
					 | 
				
			||||||
    *,
 | 
					 | 
				
			||||||
    verbose: bool = False,
 | 
					 | 
				
			||||||
    quiet: bool = False,
 | 
					 | 
				
			||||||
    force: bool = False,
 | 
					 | 
				
			||||||
) -> None:
 | 
					 | 
				
			||||||
    """Update the auto-generated Data Version Control (DVC) config file. A DVC
 | 
					 | 
				
			||||||
    project can only define one pipeline, so you need to specify one workflow
 | 
					 | 
				
			||||||
    defined in the project.yml. Will only update the file if the checksum changed.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    project_dir (Path): The project directory.
 | 
					 | 
				
			||||||
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
 | 
					 | 
				
			||||||
        If not set, the first workflow will be used.
 | 
					 | 
				
			||||||
    verbose (bool): Print more info.
 | 
					 | 
				
			||||||
    quiet (bool): Print less info.
 | 
					 | 
				
			||||||
    force (bool): Force update DVC config.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    config = load_project_config(project_dir)
 | 
					 | 
				
			||||||
    updated = update_dvc_config(
 | 
					 | 
				
			||||||
        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    help_msg = "To execute the workflow with DVC, run: dvc repro"
 | 
					 | 
				
			||||||
    if updated:
 | 
					 | 
				
			||||||
        msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def update_dvc_config(
 | 
					 | 
				
			||||||
    path: Path,
 | 
					 | 
				
			||||||
    config: Dict[str, Any],
 | 
					 | 
				
			||||||
    workflow: Optional[str] = None,
 | 
					 | 
				
			||||||
    verbose: bool = False,
 | 
					 | 
				
			||||||
    quiet: bool = False,
 | 
					 | 
				
			||||||
    force: bool = False,
 | 
					 | 
				
			||||||
) -> bool:
 | 
					 | 
				
			||||||
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
 | 
					 | 
				
			||||||
    project directory. The file is auto-generated based on the config. The
 | 
					 | 
				
			||||||
    first line of the auto-generated file specifies the hash of the config
 | 
					 | 
				
			||||||
    dict, so if any of the config values change, the DVC config is regenerated.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    path (Path): The path to the project directory.
 | 
					 | 
				
			||||||
    config (Dict[str, Any]): The loaded project.yml.
 | 
					 | 
				
			||||||
    verbose (bool): Whether to print additional info (via DVC).
 | 
					 | 
				
			||||||
    quiet (bool): Don't output anything (via DVC).
 | 
					 | 
				
			||||||
    force (bool): Force update, even if hashes match.
 | 
					 | 
				
			||||||
    RETURNS (bool): Whether the DVC config file was updated.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    ensure_dvc(path)
 | 
					 | 
				
			||||||
    workflows = config.get("workflows", {})
 | 
					 | 
				
			||||||
    workflow_names = list(workflows.keys())
 | 
					 | 
				
			||||||
    check_workflows(workflow_names, workflow)
 | 
					 | 
				
			||||||
    if not workflow:
 | 
					 | 
				
			||||||
        workflow = workflow_names[0]
 | 
					 | 
				
			||||||
    config_hash = get_hash(config)
 | 
					 | 
				
			||||||
    path = path.resolve()
 | 
					 | 
				
			||||||
    dvc_config_path = path / DVC_CONFIG
 | 
					 | 
				
			||||||
    if dvc_config_path.exists():
 | 
					 | 
				
			||||||
        # Check if the file was generated using the current config, if not, redo
 | 
					 | 
				
			||||||
        with dvc_config_path.open("r", encoding="utf8") as f:
 | 
					 | 
				
			||||||
            ref_hash = f.readline().strip().replace("# ", "")
 | 
					 | 
				
			||||||
        if ref_hash == config_hash and not force:
 | 
					 | 
				
			||||||
            return False  # Nothing has changed in project.yml, don't need to update
 | 
					 | 
				
			||||||
        dvc_config_path.unlink()
 | 
					 | 
				
			||||||
    dvc_commands = []
 | 
					 | 
				
			||||||
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # some flags that apply to every command
 | 
					 | 
				
			||||||
    flags = []
 | 
					 | 
				
			||||||
    if verbose:
 | 
					 | 
				
			||||||
        flags.append("--verbose")
 | 
					 | 
				
			||||||
    if quiet:
 | 
					 | 
				
			||||||
        flags.append("--quiet")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for name in workflows[workflow]:
 | 
					 | 
				
			||||||
        command = config_commands[name]
 | 
					 | 
				
			||||||
        deps = command.get("deps", [])
 | 
					 | 
				
			||||||
        outputs = command.get("outputs", [])
 | 
					 | 
				
			||||||
        outputs_no_cache = command.get("outputs_no_cache", [])
 | 
					 | 
				
			||||||
        if not deps and not outputs and not outputs_no_cache:
 | 
					 | 
				
			||||||
            continue
 | 
					 | 
				
			||||||
        # Default to the working dir as the project path since dvc.yaml is auto-generated
 | 
					 | 
				
			||||||
        # and we don't want arbitrary paths in there
 | 
					 | 
				
			||||||
        project_cmd = ["python", "-m", NAME, "project", "run", name]
 | 
					 | 
				
			||||||
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
 | 
					 | 
				
			||||||
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
 | 
					 | 
				
			||||||
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
 | 
					 | 
				
			||||||
        if command.get("no_skip"):
 | 
					 | 
				
			||||||
            dvc_cmd.append("--always-changed")
 | 
					 | 
				
			||||||
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
 | 
					 | 
				
			||||||
        dvc_commands.append(join_command(full_cmd))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if not dvc_commands:
 | 
					 | 
				
			||||||
        # If we don't check for this, then there will be an error when reading the
 | 
					 | 
				
			||||||
        # config, since DVC wouldn't create it.
 | 
					 | 
				
			||||||
        msg.fail(
 | 
					 | 
				
			||||||
            "No usable commands for DVC found. This can happen if none of your "
 | 
					 | 
				
			||||||
            "commands have dependencies or outputs.",
 | 
					 | 
				
			||||||
            exits=1,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    with working_dir(path):
 | 
					 | 
				
			||||||
        for c in dvc_commands:
 | 
					 | 
				
			||||||
            dvc_command = "dvc " + c
 | 
					 | 
				
			||||||
            run_command(dvc_command)
 | 
					 | 
				
			||||||
    with dvc_config_path.open("r+", encoding="utf8") as f:
 | 
					 | 
				
			||||||
        content = f.read()
 | 
					 | 
				
			||||||
        f.seek(0, 0)
 | 
					 | 
				
			||||||
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
 | 
					 | 
				
			||||||
    return True
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
 | 
					 | 
				
			||||||
    """Validate workflows provided in project.yml and check that a given
 | 
					 | 
				
			||||||
    workflow can be used to generate a DVC config.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    workflows (List[str]): Names of the available workflows.
 | 
					 | 
				
			||||||
    workflow (Optional[str]): The name of the workflow to convert.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    if not workflows:
 | 
					 | 
				
			||||||
        msg.fail(
 | 
					 | 
				
			||||||
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
 | 
					 | 
				
			||||||
            f"define at least one list of commands.",
 | 
					 | 
				
			||||||
            exits=1,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    if workflow is not None and workflow not in workflows:
 | 
					 | 
				
			||||||
        msg.fail(
 | 
					 | 
				
			||||||
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
 | 
					 | 
				
			||||||
            f"Available workflows: {', '.join(workflows)}",
 | 
					 | 
				
			||||||
            exits=1,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    if not workflow:
 | 
					 | 
				
			||||||
        msg.warn(
 | 
					 | 
				
			||||||
            f"No workflow specified for DVC pipeline. Using the first workflow "
 | 
					 | 
				
			||||||
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def ensure_dvc(project_dir: Path) -> None:
 | 
					 | 
				
			||||||
    """Ensure that the "dvc" command is available and that the current project
 | 
					 | 
				
			||||||
    directory is an initialized DVC project.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    try:
 | 
					 | 
				
			||||||
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
 | 
					 | 
				
			||||||
    except Exception:
 | 
					 | 
				
			||||||
        msg.fail(
 | 
					 | 
				
			||||||
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
 | 
					 | 
				
			||||||
            "to be installed and the 'dvc' command needs to be available",
 | 
					 | 
				
			||||||
            "You can install the Python package from pip (pip install dvc) or "
 | 
					 | 
				
			||||||
            "conda (conda install -c conda-forge dvc). For more details, see the "
 | 
					 | 
				
			||||||
            "documentation: https://dvc.org/doc/install",
 | 
					 | 
				
			||||||
            exits=1,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    if not (project_dir / ".dvc").exists():
 | 
					 | 
				
			||||||
        msg.fail(
 | 
					 | 
				
			||||||
            "Project not initialized as a DVC project",
 | 
					 | 
				
			||||||
            "To initialize a DVC project, you can run 'dvc init' in the project "
 | 
					 | 
				
			||||||
            "directory. For more details, see the documentation: "
 | 
					 | 
				
			||||||
            "https://dvc.org/doc/command-reference/init",
 | 
					 | 
				
			||||||
            exits=1,
 | 
					 | 
				
			||||||
        )
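
As an illustration (with placeholder names and paths, and assuming NAME resolves to "spacy"), for a workflow step "train" with one dependency and one output, update_dvc_config() assembles a command list like this before joining it:

full_cmd = [
    "run", "-n", "train", "-w", "/path/to/project", "--no-exec",
    "-d", "corpus/train.spacy",  # one "-d" pair per dependency
    "-o", "training/model-best",  # one "-o" pair per output
    "python", "-m", "spacy", "project", "run", "train",
]
# join_command(full_cmd) produces the string that run_command() executes with a
# "dvc " prefix; dvc.yaml then gets "# <config_hash>" plus DVC_CONFIG_COMMENT
# prepended, which is how unchanged configs are detected on the next run.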
@ -1,67 +0,0 @@
from pathlib import Path

from wasabi import msg

from .._util import Arg, load_project_config, logger, project_cli
from .remote_storage import RemoteStorage, get_command_hash
from .run import update_lockfile


@project_cli.command("pull")
def project_pull_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Retrieve available precomputed outputs from a remote storage.
    You can alias remotes in your project.yml by mapping them to storage paths.
    A storage can be anything that the smart-open library can upload to, e.g.
    AWS, Google Cloud Storage, SSH, local directories etc.

    DOCS: https://spacy.io/api/cli#project-pull
    """
    for url, output_path in project_pull(project_dir, remote):
        if url is not None:
            msg.good(f"Pulled {output_path} from {url}")


def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
    # TODO: We don't have tests for this :(. It would take a bit of mockery to
    # set up. I guess see if it breaks first?
    config = load_project_config(project_dir)
    if remote in config.get("remotes", {}):
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    commands = list(config.get("commands", []))
    # We use a while loop here because we don't know how the commands
    # will be ordered. A command might need dependencies from one that's later
    # in the list.
    while commands:
        for i, cmd in enumerate(list(commands)):
            logger.debug("CMD: %s.", cmd["name"])
            deps = [project_dir / dep for dep in cmd.get("deps", [])]
            if all(dep.exists() for dep in deps):
                cmd_hash = get_command_hash("", "", deps, cmd["script"])
                for output_path in cmd.get("outputs", []):
                    url = storage.pull(output_path, command_hash=cmd_hash)
                    logger.debug(
                        "URL: %s for %s with command hash %s",
                        url,
                        output_path,
                        cmd_hash,
                    )
                    yield url, output_path

                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
                if all(loc.exists() for loc in out_locs):
                    update_lockfile(project_dir, cmd)
                # We remove the command from the list here, and break, so that
                # we iterate over the loop again.
                commands.pop(i)
                break
            else:
                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
        else:
            # If we didn't break the for loop, break the while loop.
            break
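
The while/for/else structure above leans on Python's for ... else: the else branch runs only when the for loop finishes without a break. A minimal, self-contained sketch of the idiom:

# "else" on a for loop runs only if no "break" occurred inside the loop.
while True:
    for item in [1, 2, 3]:
        if item == 99:  # never true here, so the loop never breaks
            break
    else:
        break  # the for loop completed, so exit the while loop as well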
@ -1,69 +0,0 @@
from pathlib import Path

from wasabi import msg

from .._util import Arg, load_project_config, logger, project_cli
from .remote_storage import RemoteStorage, get_command_hash, get_content_hash


@project_cli.command("push")
def project_push_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Persist outputs to a remote storage. You can alias remotes in your
    project.yml by mapping them to storage paths. A storage can be anything that
    the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
    local directories etc.

    DOCS: https://spacy.io/api/cli#project-push
    """
    for output_path, url in project_push(project_dir, remote):
        if url is None:
            msg.info(f"Skipping {output_path}")
        else:
            msg.good(f"Pushed {output_path} to {url}")


def project_push(project_dir: Path, remote: str):
    """Persist outputs to a remote storage. You can alias remotes in your project.yml
    by mapping them to storage paths. A storage can be anything that the smart-open
    library can upload to, e.g. GCS, AWS, SSH, local directories etc.
    """
    config = load_project_config(project_dir)
    if remote in config.get("remotes", {}):
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    for cmd in config.get("commands", []):
        logger.debug("CMD: %s", cmd["name"])
        deps = [project_dir / dep for dep in cmd.get("deps", [])]
        if any(not dep.exists() for dep in deps):
            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
            continue
        cmd_hash = get_command_hash(
            "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
        )
        logger.debug("CMD_HASH: %s", cmd_hash)
        for output_path in cmd.get("outputs", []):
            output_loc = project_dir / output_path
            if output_loc.exists() and _is_not_empty_dir(output_loc):
                url = storage.push(
                    output_path,
                    command_hash=cmd_hash,
                    content_hash=get_content_hash(output_loc),
                )
                logger.debug(
                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
                )
                yield output_path, url


def _is_not_empty_dir(loc: Path):
    if not loc.is_dir():
        return True
    elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
        return True
    else:
        return False
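
Both push and pull resolve remote aliases the same way; a sketch with placeholder values showing how a name in project.yml's "remotes" mapping becomes a storage path:

config = {"remotes": {"default": "s3://my-bucket/spacy-projects"}}  # placeholder
remote = "default"
if remote in config.get("remotes", {}):
    remote = config["remotes"][remote]  # -> "s3://my-bucket/spacy-projects"
# Anything smart-open can address also works directly, e.g. a local directory.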
@ -1,212 +0,0 @@
import hashlib
 | 
					 | 
				
			||||||
import os
 | 
					 | 
				
			||||||
import site
 | 
					 | 
				
			||||||
import tarfile
 | 
					 | 
				
			||||||
import urllib.parse
 | 
					 | 
				
			||||||
from pathlib import Path
 | 
					 | 
				
			||||||
from typing import TYPE_CHECKING, Dict, List, Optional
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from wasabi import msg
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from ... import about
 | 
					 | 
				
			||||||
from ...errors import Errors
 | 
					 | 
				
			||||||
from ...git_info import GIT_VERSION
 | 
					 | 
				
			||||||
from ...util import ENV_VARS, check_bool_env_var, get_minor_version
 | 
					 | 
				
			||||||
from .._util import (
 | 
					 | 
				
			||||||
    download_file,
 | 
					 | 
				
			||||||
    ensure_pathy,
 | 
					 | 
				
			||||||
    get_checksum,
 | 
					 | 
				
			||||||
    get_hash,
 | 
					 | 
				
			||||||
    make_tempdir,
 | 
					 | 
				
			||||||
    upload_file,
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
if TYPE_CHECKING:
 | 
					 | 
				
			||||||
    from pathy import FluidPath  # noqa: F401
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class RemoteStorage:
 | 
					 | 
				
			||||||
    """Push and pull outputs to and from a remote file storage.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
 | 
					 | 
				
			||||||
    ssh, etc.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, project_root: Path, url: str, *, compression="gz"):
 | 
					 | 
				
			||||||
        self.root = project_root
 | 
					 | 
				
			||||||
        self.url = ensure_pathy(url)
 | 
					 | 
				
			||||||
        self.compression = compression
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
 | 
					 | 
				
			||||||
        """Compress a file or directory within a project and upload it to a remote
 | 
					 | 
				
			||||||
        storage. If an object exists at the full URL, nothing is done.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        Within the remote storage, files are addressed by their project path
 | 
					 | 
				
			||||||
        (url encoded) and two user-supplied hashes, representing their creation
 | 
					 | 
				
			||||||
        context and their file contents. If the URL already exists, the data is
 | 
					 | 
				
			||||||
        not uploaded. Paths are archived and compressed prior to upload.
 | 
					 | 
				
			||||||
        """
 | 
					 | 
				
			||||||
        loc = self.root / path
 | 
					 | 
				
			||||||
        if not loc.exists():
 | 
					 | 
				
			||||||
            raise IOError(f"Cannot push {loc}: does not exist.")
 | 
					 | 
				
			||||||
        url = self.make_url(path, command_hash, content_hash)
 | 
					 | 
				
			||||||
        if url.exists():
 | 
					 | 
				
			||||||
            return url
 | 
					 | 
				
			||||||
        tmp: Path
 | 
					 | 
				
			||||||
        with make_tempdir() as tmp:
 | 
					 | 
				
			||||||
            tar_loc = tmp / self.encode_name(str(path))
 | 
					 | 
				
			||||||
            mode_string = f"w:{self.compression}" if self.compression else "w"
 | 
					 | 
				
			||||||
            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
 | 
					 | 
				
			||||||
                tar_file.add(str(loc), arcname=str(path))
 | 
					 | 
				
			||||||
            upload_file(tar_loc, url)
 | 
					 | 
				
			||||||
        return url
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def pull(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["FluidPath"]:
        """Retrieve a file from the remote cache. If the file already exists,
        nothing is done.

        If the command_hash and/or content_hash are specified, only matching
        results are returned. If no matching results are available, None is
        returned.
        """
        dest = self.root / path
        if dest.exists():
            return None
        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
        if url is None:
            return url
        else:
            # Make sure the destination exists
            if not dest.parent.exists():
                dest.parent.mkdir(parents=True)
            tmp: Path
            with make_tempdir() as tmp:
                tar_loc = tmp / url.parts[-1]
                download_file(url, tar_loc)
                mode_string = f"r:{self.compression}" if self.compression else "r"
                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                    # This requires that the path is added correctly, relative
                    # to root. This is how we set things up in push()

                    # Disallow paths outside the current directory for the tar
                    # file (CVE-2007-4559, directory traversal vulnerability)
                    def is_within_directory(directory, target):
                        abs_directory = os.path.abspath(directory)
                        abs_target = os.path.abspath(target)
                        prefix = os.path.commonprefix([abs_directory, abs_target])
                        return prefix == abs_directory

                    def safe_extract(tar, path):
                        for member in tar.getmembers():
                            member_path = os.path.join(path, member.name)
                            if not is_within_directory(path, member_path):
                                raise ValueError(Errors.E852)
                        tar.extractall(path)

                    safe_extract(tar_file, self.root)
        return url

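The nested helpers implement the standard mitigation for CVE-2007-4559: every member name is resolved against the extraction root before anything is unpacked. A minimal standalone sketch of that check (the paths are made-up examples):

    import os

    def is_within_directory(directory: str, target: str) -> bool:
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        # String-prefix comparison on absolute paths, as in pull() above
        return os.path.commonprefix([abs_directory, abs_target]) == abs_directory

    assert is_within_directory("cache", "cache/training/model-best")
    assert not is_within_directory("cache", "cache/../../etc/passwd")
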
    def find(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["FluidPath"]:
        """Find the best matching version of a file within the storage,
        or `None` if no match can be found. If both the creation and content hash
        are specified, only exact matches will be returned. Otherwise, the most
        recent matching file is preferred.
        """
        name = self.encode_name(str(path))
        urls = []
        if command_hash is not None and content_hash is not None:
            url = self.url / name / command_hash / content_hash
            urls = [url] if url.exists() else []
        elif command_hash is not None:
            if (self.url / name / command_hash).exists():
                urls = list((self.url / name / command_hash).iterdir())
        else:
            if (self.url / name).exists():
                for sub_dir in (self.url / name).iterdir():
                    urls.extend(sub_dir.iterdir())
                if content_hash is not None:
                    urls = [url for url in urls if url.parts[-1] == content_hash]
        if len(urls) >= 2:
            try:
                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
            except Exception:
                msg.warn(
                    "Unable to sort remote files by last modified. The file(s) "
                    "pulled from the cache may not be the most recent."
                )
        return urls[-1] if urls else None

    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
        """Construct a URL from a subpath, a creation hash and a content hash."""
        return self.url / self.encode_name(str(path)) / command_hash / content_hash

    def encode_name(self, name: str) -> str:
        """Encode a subpath into a URL-safe name."""
        return urllib.parse.quote_plus(name)

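Together, make_url() and encode_name() define the content-addressed layout of the cache: one directory per quoted path, one per command hash, one per content hash. A sketch with placeholder hash values:

    import urllib.parse

    name = urllib.parse.quote_plus("training/model-best")  # "training%2Fmodel-best"
    # With command_hash "1a2b3c" and content_hash "4d5e6f" (placeholders), the
    # stored archive resolves to:
    #   <remote root>/training%2Fmodel-best/1a2b3c/4d5e6f
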
def get_content_hash(loc: Path) -> str:
    return get_checksum(loc)

def get_command_hash(
    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
) -> str:
    """Create a hash representing the execution of a command. This includes the
    currently installed packages, whatever environment variables have been marked
    as relevant, and the command.
    """
    if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
        spacy_v = GIT_VERSION
    else:
        spacy_v = str(get_minor_version(about.__version__) or "")
    dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
    hashes = [spacy_v, site_hash, env_hash] + dep_checksums
    hashes.extend(cmd)
    creation_bytes = "".join(hashes).encode("utf8")
    return hashlib.md5(creation_bytes).hexdigest()

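The command hash therefore changes whenever the spaCy version, the environment, any declared dependency, or the command string itself changes. Recomputing it by hand with placeholder values:

    import hashlib

    spacy_v = "3.7"                      # minor version, or a git commit if enabled
    site_hash = "abc123"                 # placeholder environment hashes
    env_hash = "def456"
    dep_checksums = ["111", "222"]       # placeholder file checksums
    cmd = ["python", "scripts/train.py"]
    hashes = [spacy_v, site_hash, env_hash] + dep_checksums + cmd
    print(hashlib.md5("".join(hashes).encode("utf8")).hexdigest())
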
def get_site_hash():
    """Hash the current Python environment's site-packages contents, including
    the name and version of the libraries. The list we're hashing is what
    `pip freeze` would output.
    """
    site_dirs = site.getsitepackages()
    if site.ENABLE_USER_SITE:
        # getusersitepackages() returns a single path string, so append it
        # rather than extending the list with its characters
        site_dirs.append(site.getusersitepackages())
    packages = set()
    for site_dir in site_dirs:
        site_dir = Path(site_dir)
        for subpath in site_dir.iterdir():
            if subpath.parts[-1].endswith("dist-info"):
                packages.add(subpath.parts[-1].replace(".dist-info", ""))
    package_bytes = "".join(sorted(packages)).encode("utf8")
    return hashlib.md5(package_bytes).hexdigest()

def get_env_hash(env: Dict[str, str]) -> str:
    """Construct a hash of the environment variables that will be passed into
    the commands.

    Values in the env dict may be references to the current os.environ, using
    the syntax $ENV_VAR to mean os.environ[ENV_VAR].
    """
    env_vars = {}
    for key, value in env.items():
        if value.startswith("$"):
            env_vars[key] = os.environ.get(value[1:], "")
        else:
            env_vars[key] = value
    return get_hash(env_vars)
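Only the variables named in the env mapping feed the hash, and $-prefixed values are dereferenced from os.environ at hash time. A sketch with a made-up mapping:

    import os

    env = {"GPU_ID": "$CUDA_VISIBLE_DEVICES", "MODE": "prod"}
    resolved = {
        key: os.environ.get(value[1:], "") if value.startswith("$") else value
        for key, value in env.items()
    }
    # resolved == {"GPU_ID": <current CUDA_VISIBLE_DEVICES or "">, "MODE": "prod"}
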

@@ -1,379 +0,0 @@
import os.path
import sys
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple

import srsly
import typer
from wasabi import msg
from wasabi.util import locale_escape

from ... import about
from ...git_info import GIT_VERSION
from ...util import (
    ENV_VARS,
    SimpleFrozenDict,
    SimpleFrozenList,
    check_bool_env_var,
    is_cwd,
    is_minor_version_match,
    join_command,
    run_command,
    split_command,
    working_dir,
)
from .._util import (
    COMMAND,
    PROJECT_FILE,
    PROJECT_LOCK,
    Arg,
    Opt,
    get_checksum,
    get_hash,
    load_project_config,
    parse_config_overrides,
    project_cli,
)

@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.

    DOCS: https://spacy.io/api/cli#project-run
    """
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
    else:
        overrides = parse_config_overrides(ctx.args)
        project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)

def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
    skip_requirements_check: bool = False,
) -> None:
    """Run a named command or workflow defined in the project.yml. For commands
    that declare dependencies and/or outputs, the lockfile is used to determine
    whether they need to be re-run.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    skip_requirements_check (bool): Whether to skip the requirements check.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)

    req_path = project_dir / "requirements.txt"
    if not skip_requirements_check:
        if config.get("check_requirements", True) and os.path.exists(req_path):
            with req_path.open() as requirements_file:
                _check_requirements([req.strip() for req in requirements_file])

    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for cmd in workflows[subcommand]:
            project_run(
                project_dir,
                cmd,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
                skip_requirements_check=True,
            )
    else:
        cmd = commands[subcommand]
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                err_exits = 1 if not dry else None
                msg.fail(err, err_help, exits=err_exits)
        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
        with working_dir(project_dir) as current_dir:
            msg.divider(subcommand)
            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
            if not rerun and not force:
                msg.info(f"Skipping '{cmd['name']}': nothing changed")
            else:
                run_commands(cmd["script"], dry=dry, capture=capture)
                if not dry:
                    update_lockfile(current_dir, cmd)

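For a workflow, project_run() recurses once per step and passes skip_requirements_check=True so requirements are only verified on the outer call. A sketch of the parsed project.yml shape this walks (command and file names are illustrative):

    config = {
        "commands": [
            {"name": "preprocess", "script": ["python scripts/preprocess.py"],
             "outputs": ["corpus/train.spacy"]},
            {"name": "train", "script": ["python -m spacy train config.cfg"],
             "deps": ["corpus/train.spacy"]},
        ],
        "workflows": {"all": ["preprocess", "train"]},
    }
    # `spacy project run all` then behaves like:
    #   project_run(project_dir, "preprocess", ...)
    #   project_run(project_dir, "train", ...)
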
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project.yml.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    project_loc = "" if is_cwd(project_dir) else project_dir
    if subcommand:
        validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
        print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
        if subcommand in commands:
            help_text = commands[subcommand].get("help")
            if help_text:
                print(f"\n{help_text}\n")
        elif subcommand in workflows:
            steps = workflows[subcommand]
            print(f"\nWorkflow consisting of {len(steps)} commands:")
            steps_data = [
                (f"{i + 1}. {step}", commands[step].get("help", ""))
                for i, step in enumerate(steps)
            ]
            msg.table(steps_data)
            help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
            print(f"For command details, run: {help_cmd}")
    else:
        print("")
        title = config.get("title")
        if title:
            print(f"{locale_escape(title)}\n")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
            msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
            msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])

def run_commands(
    commands: Iterable[str] = SimpleFrozenList(),
    silent: bool = False,
    dry: bool = False,
    capture: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (Iterable[str]): The string commands.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    """
    for c in commands:
        command = split_command(c)
        # Not sure if this is needed or a good idea. Motivation: users may often
        # use commands in their config that reference "python" and we want to
        # make sure that it's always executing the same Python that spaCy is
        # executed with and the pip in the same env, not some other Python/pip.
        # Also ensures cross-compatibility if user 1 writes "python3" (because
        # that's how it's set up on their system), and user 2 without the
        # shortcut tries to re-run the command.
        if len(command) and command[0] in ("python", "python3"):
            command[0] = sys.executable
        elif len(command) and command[0] in ("pip", "pip3"):
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {join_command(command)}")
        if not dry:
            run_command(command, capture=capture)

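The interpreter rewrite guarantees that "python" and "pip" in project scripts always resolve to the interpreter and pip of the environment spaCy runs in. A standalone sketch (using shlex.split as a stand-in for spaCy's split_command):

    import sys
    from shlex import split as split_command  # stand-in for spaCy's split_command

    command = split_command("python scripts/train.py --gpu 0")
    if command and command[0] in ("python", "python3"):
        command[0] = sys.executable
    elif command and command[0] in ("pip", "pip3"):
        command = [sys.executable, "-m", "pip", *command[1:]]
    # e.g. ["/usr/bin/python3.11", "scripts/train.py", "--gpu", "0"]
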
def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand.
    """
    if not commands and not workflows:
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand not in commands and subcommand not in workflows:
        help_msg = []
        if subcommand in ["assets", "asset"]:
            help_msg.append("Did you mean to run: python -m spacy project assets?")
        if commands:
            help_msg.append(f"Available commands: {', '.join(commands)}")
        if workflows:
            help_msg.append(f"Available workflows: {', '.join(workflows)}")
        msg.fail(
            f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
            ". ".join(help_msg),
            exits=1,
        )

def check_rerun(
    project_dir: Path,
    command: Dict[str, Any],
    *,
    check_spacy_version: bool = True,
    check_spacy_commit: bool = False,
) -> bool:
    """Check if a command should be rerun because its settings or inputs/outputs
    changed.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    check_spacy_version (bool): Whether to re-run if the spaCy minor version
        recorded in the lockfile differs from the installed version.
    check_spacy_commit (bool): Whether to re-run if the spaCy commit hash
        recorded in the lockfile differs from the current build.
    RETURNS (bool): Whether to re-run the command.
    """
    # Always rerun if no-skip is set
    if command.get("no_skip", False):
        return True
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():  # We don't have a lockfile, run command
        return True
    data = srsly.read_yaml(lock_path)
    if command["name"] not in data:  # We don't have info about this command
        return True
    entry = data[command["name"]]
    # Always run commands with no outputs (otherwise they'd always be skipped)
    if not entry.get("outs", []):
        return True
    # Always rerun if spaCy version or commit hash changed
    spacy_v = entry.get("spacy_version")
    commit = entry.get("spacy_git_version")
    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
        return True
    if check_spacy_commit and commit != GIT_VERSION:
        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
        return True
    # If the entry in the lockfile matches the lockfile entry that would be
    # generated from the current command, we don't rerun because it means that
    # all inputs/outputs, hashes and scripts are the same and nothing changed
    lock_entry = get_lock_entry(project_dir, command)
    exclude = ["spacy_version", "spacy_git_version"]
    return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)

def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
    """Update the lockfile after running a command. Will create a lockfile if
    it doesn't yet exist and will add an entry for the current command, its
    script and dependencies/outputs.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if not lock_path.exists():
        srsly.write_yaml(lock_path, {})
        data = {}
    else:
        data = srsly.read_yaml(lock_path)
    data[command["name"]] = get_lock_entry(project_dir, command)
    srsly.write_yaml(lock_path, data)

def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
    """Get a lockfile entry for a given command. An entry includes the command,
    the script (command steps) and a list of dependencies and outputs with
    their paths and file hashes, if available. The format is based on the
    dvc.lock files, to keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    deps = get_fileinfo(project_dir, command.get("deps", []))
    outs = get_fileinfo(project_dir, command.get("outputs", []))
    outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
    return {
        "cmd": f"{COMMAND} run {command['name']}",
        "script": command["script"],
        "deps": deps,
        "outs": [*outs, *outs_nc],
        "spacy_version": about.__version__,
        "spacy_git_version": GIT_VERSION,
    }

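For a hypothetical 'train' command, the resulting lockfile entry would look roughly like this (hash values are placeholders):

    {
        "cmd": "python -m spacy run train",
        "script": ["python -m spacy train config.cfg"],
        "deps": [{"path": "corpus/train.spacy", "md5": "9f1c..."}],
        "outs": [{"path": "training/model-best", "md5": "07aa..."}],
        "spacy_version": "3.7.0",
        "spacy_git_version": "df07c47",
    }
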
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
    """Generate the file information for a list of paths (dependencies, outputs).
    Includes the file path and the file's checksum.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    RETURNS (List[Dict[str, Optional[str]]]): One entry per path; the checksum
        is None if the file doesn't exist.
    """
    data = []
    for path in paths:
        file_path = project_dir / path
        md5 = get_checksum(file_path) if file_path.exists() else None
        data.append({"path": path, "md5": md5})
    return data

def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
    """Checks whether requirements are installed and free of version conflicts.
    requirements (List[str]): List of requirements.
    RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported and
        (2) any version conflicts exist.
    """
    import pkg_resources

    failed_pkgs_msgs: List[str] = []
    conflicting_pkgs_msgs: List[str] = []

    for req in requirements:
        try:
            pkg_resources.require(req)
        except pkg_resources.DistributionNotFound as dnf:
            failed_pkgs_msgs.append(dnf.report())
        except pkg_resources.VersionConflict as vc:
            conflicting_pkgs_msgs.append(vc.report())
        except Exception:
            msg.warn(
                f"Unable to check requirement: {req}. "
                "Checks are currently limited to requirement specifiers "
                "(PEP 508)."
            )

    if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
        msg.warn(
            title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
            "correctly and you installed all requirements specified in your project's requirements.txt: "
        )
        for pkg_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
            msg.text(pkg_msg)

    return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
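The check leans entirely on pkg_resources, which signals the two failure modes with distinct exceptions. A sketch of both paths (the second requirement is deliberately bogus):

    import pkg_resources

    for req in ["spacy>=3.7,<3.8", "nonexistent-package==1.0"]:
        try:
            pkg_resources.require(req)
        except pkg_resources.DistributionNotFound as dnf:
            print("missing:", dnf.report())
        except pkg_resources.VersionConflict as vc:
            print("conflict:", vc.report())
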

@@ -26,6 +26,9 @@ batch_size = 1000
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
 
+[nlp.vectors]
+@vectors = "spacy.Vectors.v1"
+
 # The pipeline components and their models
 [components]
 
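The new [nlp.vectors] block makes the vectors implementation a registered, swappable function, with spacy.Vectors.v1 as the default so existing configs keep working. A sketch of wiring in a custom implementation (the names are illustrative, not a real package):

    import spacy
    from spacy.util import registry

    @registry.vectors("MyVectors.v1")  # hypothetical name, referenced from the config
    def create_my_vectors():
        def create_vectors(vocab):
            # Return a BaseVectors implementation bound to this vocab; the
            # construction logic here is a placeholder.
            ...
        return create_vectors

    # In the config, [nlp.vectors] would then read: @vectors = "MyVectors.v1"
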

@@ -219,6 +219,7 @@ class Warnings(metaclass=ErrorsWithCodes):
     W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
             "key attribute for vectors, configure it through Vectors(attr=) or "
             "'spacy init vectors --attr'")
+    W126 = ("These keys are unsupported: {unsupported}")
 
 
 class Errors(metaclass=ErrorsWithCodes):

@@ -553,12 +554,12 @@ class Errors(metaclass=ErrorsWithCodes):
             "during training, make sure to include it in 'annotating components'")
 
     # New errors added in v3.x
+    E849 = ("The vocab only supports {method} for vectors of type "
+            "spacy.vectors.Vectors, not {vectors_type}.")
     E850 = ("The PretrainVectors objective currently only supports default or "
             "floret vectors, not {mode} vectors.")
     E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
             "but found value of '{val}'.")
-    E852 = ("The tar file pulled from the remote attempted an unsafe path "
-            "traversal.")
     E853 = ("Unsupported component factory name '{name}'. The character '.' is "
             "not permitted in factory names.")
     E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "

@@ -981,6 +982,8 @@ class Errors(metaclass=ErrorsWithCodes):
              " 'min_length': {min_length}, 'max_length': {max_length}")
     E1054 = ("The text, including whitespace, must match between reference and "
              "predicted docs when training {component}.")
+    E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
+             "but only callbacks with one or three parameters are supported")
 
 
 # Deprecated model shortcuts, only used in errors and warnings

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True
 
 from typing import Iterable
 

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True
 
 from pathlib import Path
 from typing import Iterable, Tuple, Union

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True
 from typing import Any, Callable, Dict, Iterable
 
 import srsly

@@ -163,7 +163,7 @@ class SpanishLemmatizer(Lemmatizer):
         for old, new in self.lookups.get_table("lemma_rules").get("det", []):
             if word == old:
                 return [new]
-        # If none of the specfic rules apply, search in the common rules for
+        # If none of the specific rules apply, search in the common rules for
         # determiners and pronouns that follow a unique pattern for
         # lemmatization. If the word is in the list, return the corresponding
         # lemma.

@@ -291,7 +291,7 @@ class SpanishLemmatizer(Lemmatizer):
         for old, new in self.lookups.get_table("lemma_rules").get("pron", []):
             if word == old:
                 return [new]
-        # If none of the specfic rules apply, search in the common rules for
+        # If none of the specific rules apply, search in the common rules for
         # determiners and pronouns that follow a unique pattern for
         # lemmatization. If the word is in the list, return the corresponding
         # lemma.

@@ -15,6 +15,7 @@ _prefixes = (
     [
         "†",
         "⸏",
+        "〈",
     ]
     + LIST_PUNCT
     + LIST_ELLIPSES

@@ -31,6 +32,7 @@ _suffixes = (
     + [
         "†",
         "⸎",
+        "〉",
         r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
     ]
 )

@@ -1,4 +1,5 @@
 import functools
+import inspect
 import itertools
 import multiprocessing as mp
 import random

@@ -64,6 +65,7 @@ from .util import (
     registry,
     warn_if_jupyter_cupy,
 )
+from .vectors import BaseVectors
 from .vocab import Vocab, create_vocab
 
 PipeCallable = Callable[[Doc], Doc]

@@ -157,6 +159,7 @@ class Language:
         max_length: int = 10**6,
         meta: Dict[str, Any] = {},
         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
+        create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
         batch_size: int = 1000,
         **kwargs,
     ) -> None:

@@ -197,6 +200,10 @@ class Language:
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
             vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
+            if not create_vectors:
+                vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
+                create_vectors = registry.resolve(vectors_cfg)["vectors"]
+            vocab.vectors = create_vectors(vocab)
         else:
             if (self.lang and vocab.lang) and (self.lang != vocab.lang):
                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))

@@ -1764,6 +1771,10 @@ class Language:
             ).merge(config)
         if "nlp" not in config:
             raise ValueError(Errors.E985.format(config=config))
+        # fill in [nlp.vectors] if not present (as a narrower alternative to
+        # auto-filling [nlp] from the default config)
+        if "vectors" not in config["nlp"]:
+            config["nlp"]["vectors"] = {"@vectors": "spacy.Vectors.v1"}
         config_lang = config["nlp"].get("lang")
         if config_lang is not None and config_lang != cls.lang:
             raise ValueError(

@@ -1795,6 +1806,7 @@ class Language:
             filled["nlp"], validate=validate, schema=ConfigSchemaNlp
         )
         create_tokenizer = resolved_nlp["tokenizer"]
+        create_vectors = resolved_nlp["vectors"]
         before_creation = resolved_nlp["before_creation"]
         after_creation = resolved_nlp["after_creation"]
         after_pipeline_creation = resolved_nlp["after_pipeline_creation"]

@@ -1815,7 +1827,12 @@ class Language:
         # inside stuff like the spacy train function. If we loaded them here,
         # then we would load them twice at runtime: once when we make from config,
         # and then again when we load from disk.
-        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
+        nlp = lang_cls(
+            vocab=vocab,
+            create_tokenizer=create_tokenizer,
+            create_vectors=create_vectors,
+            meta=meta,
+        )
         if after_creation is not None:
             nlp = after_creation(nlp)
             if not isinstance(nlp, cls):

@@ -2032,8 +2049,20 @@ class Language:
             # Go over the listener layers and replace them
             for listener in pipe_listeners:
                 new_model = tok2vec_model.copy()
-                if "replace_listener" in tok2vec_model.attrs:
-                    new_model = tok2vec_model.attrs["replace_listener"](new_model)
+                replace_listener_func = tok2vec_model.attrs.get("replace_listener")
+                if replace_listener_func is not None:
+                    # Pass the extra args to the callback without breaking compatibility with
+                    # old library versions that only expect a single parameter.
+                    num_params = len(
+                        inspect.signature(replace_listener_func).parameters
+                    )
+                    if num_params == 1:
+                        new_model = replace_listener_func(new_model)
+                    elif num_params == 3:
+                        new_model = replace_listener_func(new_model, listener, tok2vec)
+                    else:
+                        raise ValueError(Errors.E1055.format(num_params=num_params))
 
                 util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined]
                 tok2vec.remove_listener(listener, pipe_name)
 
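With the inspect-based dispatch, a replace_listener callback may take either one parameter (the copied model, as before) or three; any other arity raises E1055. A sketch of the three-parameter form (the body is placeholder logic):

    def replace_listener(copied_model, listener, tok2vec_pipe):
        # The extra arguments expose the listener layer being replaced and the
        # source tok2vec pipe, e.g. so settings can be carried over. Placeholder:
        copied_model.attrs["source_component"] = getattr(tok2vec_pipe, "name", None)
        return copied_model

    # Attached to the tok2vec model so nlp.replace_listeners() can find it:
    #   tok2vec_model.attrs["replace_listener"] = replace_listener
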

@@ -1,4 +1,5 @@
 # cython: embedsignature=True
+# cython: profile=False
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
 from libc.string cimport memset

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True
 import warnings
 from collections import defaultdict
 from itertools import product

@@ -129,6 +129,7 @@ cdef class DependencyMatcher:
             else:
                 required_keys = {"RIGHT_ID", "RIGHT_ATTRS", "REL_OP", "LEFT_ID"}
                 relation_keys = set(relation.keys())
+                # Identify required keys that have not been specified
                 missing = required_keys - relation_keys
                 if missing:
                     missing_txt = ", ".join(list(missing))

@@ -136,6 +137,13 @@ cdef class DependencyMatcher:
                         required=required_keys,
                         missing=missing_txt
                     ))
+                # Identify additional, unsupported keys
+                unsupported = relation_keys - required_keys
+                if unsupported:
+                    unsupported_txt = ", ".join(list(unsupported))
+                    warnings.warn(Warnings.W126.format(
+                        unsupported=unsupported_txt
+                    ))
                 if (
                     relation["RIGHT_ID"] in visited_nodes
                     or relation["LEFT_ID"] not in visited_nodes
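A pattern that carries a stray key, say a misspelled operator, still compiles, but now emits W126 instead of being silently ignored. A sketch:

    import spacy
    from spacy.matcher import DependencyMatcher

    nlp = spacy.blank("en")
    matcher = DependencyMatcher(nlp.vocab)
    pattern = [
        {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
        {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "subject",
         "RIGHT_ATTRS": {"DEP": "nsubj"}, "REL_OPP": ">"},  # typo key -> W126
    ]
    matcher.add("SUBJ", [pattern])  # warns: These keys are unsupported: REL_OPP
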

@@ -1,4 +1,4 @@
-# cython: profile=True, binding=True, infer_types=True
+# cython: binding=True, infer_types=True
 from cpython.object cimport PyObject
 from libc.stdint cimport int64_t
 

@@ -1,4 +1,4 @@
-# cython: binding=True, infer_types=True, profile=True
+# cython: binding=True, infer_types=True
 from typing import Iterable, List
 
 from cymem.cymem cimport Pool

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True
 from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
 
 import warnings

@@ -1,4 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
+# cython: profile=False
 cimport numpy as np
 from libc.math cimport exp
 from libc.stdlib cimport calloc, free, realloc
| 
						 | 
					@ -9,7 +9,7 @@ from thinc.util import partial
 | 
				
			||||||
from ..attrs import ORTH
 | 
					from ..attrs import ORTH
 | 
				
			||||||
from ..errors import Errors, Warnings
 | 
					from ..errors import Errors, Warnings
 | 
				
			||||||
from ..tokens import Doc
 | 
					from ..tokens import Doc
 | 
				
			||||||
from ..vectors import Mode
 | 
					from ..vectors import Mode, Vectors
 | 
				
			||||||
from ..vocab import Vocab
 | 
					from ..vocab import Vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -48,11 +48,14 @@ def forward(
 | 
				
			||||||
    key_attr: int = getattr(vocab.vectors, "attr", ORTH)
 | 
					    key_attr: int = getattr(vocab.vectors, "attr", ORTH)
 | 
				
			||||||
    keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
 | 
					    keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
 | 
				
			||||||
    W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
 | 
					    W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
 | 
				
			||||||
    if vocab.vectors.mode == Mode.default:
 | 
					    if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default:
 | 
				
			||||||
        V = model.ops.asarray(vocab.vectors.data)
 | 
					        V = model.ops.asarray(vocab.vectors.data)
 | 
				
			||||||
        rows = vocab.vectors.find(keys=keys)
 | 
					        rows = vocab.vectors.find(keys=keys)
 | 
				
			||||||
        V = model.ops.as_contig(V[rows])
 | 
					        V = model.ops.as_contig(V[rows])
 | 
				
			||||||
    elif vocab.vectors.mode == Mode.floret:
 | 
					    elif isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.floret:
 | 
				
			||||||
 | 
					        V = vocab.vectors.get_batch(keys)
 | 
				
			||||||
 | 
					        V = model.ops.as_contig(V)
 | 
				
			||||||
 | 
					    elif hasattr(vocab.vectors, "get_batch"):
 | 
				
			||||||
        V = vocab.vectors.get_batch(keys)
 | 
					        V = vocab.vectors.get_batch(keys)
 | 
				
			||||||
        V = model.ops.as_contig(V)
 | 
					        V = model.ops.as_contig(V)
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
| 
						 | 
					@ -61,7 +64,7 @@ def forward(
 | 
				
			||||||
        vectors_data = model.ops.gemm(V, W, trans2=True)
 | 
					        vectors_data = model.ops.gemm(V, W, trans2=True)
 | 
				
			||||||
    except ValueError:
 | 
					    except ValueError:
 | 
				
			||||||
        raise RuntimeError(Errors.E896)
 | 
					        raise RuntimeError(Errors.E896)
 | 
				
			||||||
    if vocab.vectors.mode == Mode.default:
 | 
					    if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default:
 | 
				
			||||||
        # Convert negative indices to 0-vectors
 | 
					        # Convert negative indices to 0-vectors
 | 
				
			||||||
        # TODO: more options for UNK tokens
 | 
					        # TODO: more options for UNK tokens
 | 
				
			||||||
        vectors_data[rows < 0] = 0
 | 
					        vectors_data[rows < 0] = 0
 | 
				
			||||||
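
The isinstance(vocab.vectors, Vectors) guards, together with the new elif hasattr(vocab.vectors, "get_batch") branch, let StaticVectors consume vector tables that are not spaCy's own Vectors class: any object that can return a 2-D batch of rows for a sequence of keys now flows through. A toy sketch of that duck-typed contract (class name and width are hypothetical):

import numpy


class ToyVectors:
    # Bare-minimum stand-in: anything exposing get_batch(keys) -> 2d array
    # can take the hasattr branch above.
    def __init__(self, width: int = 16):
        self.width = width

    def get_batch(self, keys) -> numpy.ndarray:
        # Deterministic pseudo-embeddings derived from each key's hash value.
        rows = [
            numpy.random.RandomState(int(key) % (2**32)).rand(self.width)
            for key in keys
        ]
        return numpy.asarray(rows, dtype="f")


print(ToyVectors().get_batch([71, 2045, 71]).shape)  # (3, 16)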

@@ -1,4 +1,5 @@
 # cython: infer_types
+# cython: profile=False
 import warnings

 import numpy

@@ -1,4 +1,4 @@
+# cython: profile=False
 IDS = {
     "": NO_TAG,
     "ADJ": ADJ,

@@ -1,4 +1,5 @@
 # cython: infer_types=True, binding=True
+# cython: profile=False
 from cython.operator cimport dereference as deref
 from libc.stdint cimport UINT32_MAX, uint32_t
 from libc.string cimport memset

@@ -1,5 +1,4 @@
 # cython: infer_types=True
-# cython: profile=True
 import numpy

 from thinc.extra.search cimport Beam

@@ -0,0 +1 @@
+# cython: profile=False

@@ -1,4 +1,4 @@
-# cython: profile=True, cdivision=True, infer_types=True
+# cython: cdivision=True, infer_types=True
 from cymem.cymem cimport Address, Pool
 from libc.stdint cimport int32_t
 from libcpp.vector cimport vector

@@ -1,3 +1,4 @@
+# cython: profile=False
 from cymem.cymem cimport Pool
 from libc.stdint cimport int32_t

@@ -1,4 +1,4 @@
-# cython: profile=True, infer_types=True
+# cython: infer_types=True
 """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
 for doing pseudo-projective parsing implementation uses the HEAD decoration
 scheme.

@@ -1,4 +1,5 @@
 # cython: infer_types=True
+# cython: profile=False
 from libcpp.vector cimport vector

 from ...tokens.doc cimport Doc

@@ -1,4 +1,5 @@
 # cython: infer_types=True
+# cython: profile=False
 from __future__ import print_function

 from cymem.cymem cimport Pool

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
 from collections import defaultdict
 from typing import Callable, Optional

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
 from itertools import islice
 from typing import Callable, Dict, Optional, Union

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
 from typing import Optional

 import numpy

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
 from collections import defaultdict
 from typing import Callable, Optional

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
 import warnings
 from typing import Callable, Dict, Iterable, Iterator, Tuple, Union

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
 from typing import Callable, List, Optional

 import srsly

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
 from itertools import islice
 from typing import Callable, Optional

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
 from itertools import islice
 from typing import Callable, Optional

@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
 from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple

 import srsly

@@ -1,4 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
+# cython: profile=False
 from __future__ import print_function

 cimport numpy as np

@@ -412,6 +412,7 @@ class ConfigSchemaNlp(BaseModel):
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
     batch_size: Optional[int] = Field(..., title="Default batch size")
+    vectors: Callable = Field(..., title="Vectors implementation")
     # fmt: on

     class Config:
@@ -480,66 +481,6 @@ CONFIG_SCHEMAS = {
     "initialize": ConfigSchemaInit,
 }

-
-# Project config Schema
-
-
-class ProjectConfigAssetGitItem(BaseModel):
-    # fmt: off
-    repo: StrictStr = Field(..., title="URL of Git repo to download from")
-    path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)")
-    branch: StrictStr = Field("master", title="Branch to clone from")
-    # fmt: on
-
-
-class ProjectConfigAssetURL(BaseModel):
-    # fmt: off
-    dest: StrictStr = Field(..., title="Destination of downloaded asset")
-    url: Optional[StrictStr] = Field(None, title="URL of asset")
-    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
-    description: StrictStr = Field("", title="Description of asset")
-    # fmt: on
-
-
-class ProjectConfigAssetGit(BaseModel):
-    # fmt: off
-    git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
-    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
-    description: Optional[StrictStr] = Field(None, title="Description of asset")
-    # fmt: on
-
-
-class ProjectConfigCommand(BaseModel):
-    # fmt: off
-    name: StrictStr = Field(..., title="Name of command")
-    help: Optional[StrictStr] = Field(None, title="Command description")
-    script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
-    deps: List[StrictStr] = Field([], title="File dependencies required by this command")
-    outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
-    outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
-    no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
-    # fmt: on
-
-    class Config:
-        title = "A single named command specified in a project config"
-        extra = "forbid"
-
-
-class ProjectConfigSchema(BaseModel):
-    # fmt: off
-    vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
-    env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
-    assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
-    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
-    commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
-    title: Optional[str] = Field(None, title="Project title")
-    spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
-    # fmt: on
-
-    class Config:
-        title = "Schema for project configuration file"
-
-
 # Recommendations for init config workflows


@@ -1,4 +1,5 @@
 # cython: infer_types=True
+# cython: profile=False
 cimport cython
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy

@@ -1,4 +1,5 @@
 # cython: optimize.unpack_method_calls=False
+# cython: profile=False
 IDS = {
     "": NIL,
     "IS_ALPHA": IS_ALPHA,

@@ -216,6 +216,11 @@ def test_dependency_matcher_pattern_validation(en_vocab):
         pattern2 = copy.deepcopy(pattern)
         pattern2[1]["RIGHT_ID"] = "fox"
         matcher.add("FOUNDED", [pattern2])
+    # invalid key
+    with pytest.warns(UserWarning):
+        pattern2 = copy.deepcopy(pattern)
+        pattern2[1]["FOO"] = "BAR"
+        matcher.add("FOUNDED", [pattern2])


 def test_dependency_matcher_callback(en_vocab, doc):

@@ -4,8 +4,8 @@ from pathlib import Path

 def test_build_dependencies():
     # Check that library requirements are pinned exactly the same across different setup files.
-    # TODO: correct checks for numpy rather than ignoring
     libs_ignore_requirements = [
+        "numpy",
         "pytest",
         "pytest-timeout",
         "mock",
@@ -23,6 +23,7 @@ def test_build_dependencies():
     ]
     # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [
+        "numpy",
         "fugashi",
         "natto-py",
         "pythainlp",

@@ -1,31 +1,19 @@
 import math
 import os
-import time
 from collections import Counter
 from pathlib import Path
 from typing import Any, Dict, List, Tuple

-import numpy
 import pytest
 import srsly
 from click import NoSuchOption
 from packaging.specifiers import SpecifierSet
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config

 import spacy
 from spacy import about
 from spacy.cli import info
-from spacy.cli._util import (
-    download_file,
-    is_subpath_of,
-    load_project_config,
-    parse_config_overrides,
-    string_to_list,
-    substitute_project_variables,
-    upload_file,
-    validate_project_commands,
-    walk_directory,
-)
+from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory
 from spacy.cli.apply import apply
 from spacy.cli.debug_data import (
     _compile_gold,
@@ -43,13 +31,11 @@ from spacy.cli.find_threshold import find_threshold
 from spacy.cli.init_config import RECOMMENDATIONS, fill_config, init_config
 from spacy.cli.init_pipeline import _init_labels
 from spacy.cli.package import _is_permitted_package_name, get_third_party_dependencies
-from spacy.cli.project.remote_storage import RemoteStorage
-from spacy.cli.project.run import _check_requirements
 from spacy.cli.validate import get_model_pkgs
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch
 from spacy.language import Language
-from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
+from spacy.schemas import RecommendationSchema
 from spacy.tokens import Doc, DocBin
 from spacy.tokens.span import Span
 from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
@@ -134,25 +120,6 @@ def test_issue7055():
     assert "model" in filled_cfg["components"]["ner"]


-@pytest.mark.issue(11235)
-def test_issue11235():
-    """
-    Test that the cli handles interpolation in the directory names correctly when loading project config.
-    """
-    lang_var = "en"
-    variables = {"lang": lang_var}
-    commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
-    directories = ["cfg", "${vars.lang}_model"]
-    project = {"commands": commands, "vars": variables, "directories": directories}
-    with make_tempdir() as d:
-        srsly.write_yaml(d / "project.yml", project)
-        cfg = load_project_config(d)
-        # Check that the directories are interpolated and created correctly
-        assert os.path.exists(d / "cfg")
-        assert os.path.exists(d / f"{lang_var}_model")
-    assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
-
-
 @pytest.mark.issue(12566)
 @pytest.mark.parametrize(
     "factory,output_file",
@@ -443,136 +410,6 @@ def test_cli_converters_conll_ner_to_docs():
         assert ent.text in ["New York City", "London"]


-def test_project_config_validation_full():
-    config = {
-        "vars": {"some_var": 20},
-        "directories": ["assets", "configs", "corpus", "scripts", "training"],
-        "assets": [
-            {
-                "dest": "x",
-                "extra": True,
-                "url": "https://example.com",
-                "checksum": "63373dd656daa1fd3043ce166a59474c",
-            },
-            {
-                "dest": "y",
-                "git": {
-                    "repo": "https://github.com/example/repo",
-                    "branch": "develop",
-                    "path": "y",
-                },
-            },
-            {
-                "dest": "z",
-                "extra": False,
-                "url": "https://example.com",
-                "checksum": "63373dd656daa1fd3043ce166a59474c",
-            },
-        ],
-        "commands": [
-            {
-                "name": "train",
-                "help": "Train a model",
-                "script": ["python -m spacy train config.cfg -o training"],
-                "deps": ["config.cfg", "corpus/training.spcy"],
-                "outputs": ["training/model-best"],
-            },
-            {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True},
-        ],
-        "workflows": {"all": ["train", "test"], "train": ["train"]},
-    }
-    errors = validate(ProjectConfigSchema, config)
-    assert not errors
-
-
-@pytest.mark.parametrize(
-    "config",
-    [
-        {"commands": [{"name": "a"}, {"name": "a"}]},
-        {"commands": [{"name": "a"}], "workflows": {"a": []}},
-        {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
-    ],
-)
-def test_project_config_validation1(config):
-    with pytest.raises(SystemExit):
-        validate_project_commands(config)
-
-
-@pytest.mark.parametrize(
-    "config,n_errors",
-    [
-        ({"commands": {"a": []}}, 1),
-        ({"commands": [{"help": "..."}]}, 1),
-        ({"commands": [{"name": "a", "extra": "b"}]}, 1),
-        ({"commands": [{"extra": "b"}]}, 2),
-        ({"commands": [{"name": "a", "deps": [123]}]}, 1),
-    ],
-)
-def test_project_config_validation2(config, n_errors):
-    errors = validate(ProjectConfigSchema, config)
-    assert len(errors) == n_errors
-
-
-@pytest.mark.parametrize(
-    "int_value",
-    [10, pytest.param("10", marks=pytest.mark.xfail)],
-)
-def test_project_config_interpolation(int_value):
-    variables = {"a": int_value, "b": {"c": "foo", "d": True}}
-    commands = [
-        {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]},
-        {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]},
-    ]
-    project = {"commands": commands, "vars": variables}
-    with make_tempdir() as d:
-        srsly.write_yaml(d / "project.yml", project)
-        cfg = load_project_config(d)
-    assert type(cfg) == dict
-    assert type(cfg["commands"]) == list
-    assert cfg["commands"][0]["script"][0] == "hello 10 foo"
-    assert cfg["commands"][1]["script"][0] == "foo true"
-    commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}]
-    project = {"commands": commands, "vars": variables}
-    with pytest.raises(ConfigValidationError):
-        substitute_project_variables(project)
-
-
-@pytest.mark.parametrize(
-    "greeting",
-    [342, "everyone", "tout le monde", pytest.param("42", marks=pytest.mark.xfail)],
-)
-def test_project_config_interpolation_override(greeting):
-    variables = {"a": "world"}
-    commands = [
-        {"name": "x", "script": ["hello ${vars.a}"]},
-    ]
-    overrides = {"vars.a": greeting}
-    project = {"commands": commands, "vars": variables}
-    with make_tempdir() as d:
-        srsly.write_yaml(d / "project.yml", project)
-        cfg = load_project_config(d, overrides=overrides)
-    assert type(cfg) == dict
-    assert type(cfg["commands"]) == list
-    assert cfg["commands"][0]["script"][0] == f"hello {greeting}"
-
-
-def test_project_config_interpolation_env():
-    variables = {"a": 10}
-    env_var = "SPACY_TEST_FOO"
-    env_vars = {"foo": env_var}
-    commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}]
-    project = {"commands": commands, "vars": variables, "env": env_vars}
-    with make_tempdir() as d:
-        srsly.write_yaml(d / "project.yml", project)
-        cfg = load_project_config(d)
-    assert cfg["commands"][0]["script"][0] == "hello 10 "
-    os.environ[env_var] = "123"
-    with make_tempdir() as d:
-        srsly.write_yaml(d / "project.yml", project)
-        cfg = load_project_config(d)
-    assert cfg["commands"][0]["script"][0] == "hello 10 123"
-
-
 @pytest.mark.parametrize(
     "args,expected",
     [
@@ -782,21 +619,6 @@ def test_get_third_party_dependencies():
     get_third_party_dependencies(nlp.config)


-@pytest.mark.parametrize(
-    "parent,child,expected",
-    [
-        ("/tmp", "/tmp", True),
-        ("/tmp", "/", False),
-        ("/tmp", "/tmp/subdir", True),
-        ("/tmp", "/tmpdir", False),
-        ("/tmp", "/tmp/subdir/..", True),
-        ("/tmp", "/tmp/..", False),
-    ],
-)
-def test_is_subpath_of(parent, child, expected):
-    assert is_subpath_of(parent, child) == expected
-
-
 @pytest.mark.slow
 @pytest.mark.parametrize(
     "factory_name,pipe_name",
@@ -1042,60 +864,6 @@ def test_applycli_user_data():
         assert result[0]._.ext == val


-def test_local_remote_storage():
-    with make_tempdir() as d:
-        filename = "a.txt"
-
-        content_hashes = ("aaaa", "cccc", "bbbb")
-        for i, content_hash in enumerate(content_hashes):
-            # make sure that each subsequent file has a later timestamp
-            if i > 0:
-                time.sleep(1)
-            content = f"{content_hash} content"
-            loc_file = d / "root" / filename
-            if not loc_file.parent.exists():
-                loc_file.parent.mkdir(parents=True)
-            with loc_file.open(mode="w") as file_:
-                file_.write(content)
-
-            # push first version to remote storage
-            remote = RemoteStorage(d / "root", str(d / "remote"))
-            remote.push(filename, "aaaa", content_hash)
-
-            # retrieve with full hashes
-            loc_file.unlink()
-            remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
-            with loc_file.open(mode="r") as file_:
-                assert file_.read() == content
-
-            # retrieve with command hash
-            loc_file.unlink()
-            remote.pull(filename, command_hash="aaaa")
-            with loc_file.open(mode="r") as file_:
-                assert file_.read() == content
-
-            # retrieve with content hash
-            loc_file.unlink()
-            remote.pull(filename, content_hash=content_hash)
-            with loc_file.open(mode="r") as file_:
-                assert file_.read() == content
-
-            # retrieve with no hashes
-            loc_file.unlink()
-            remote.pull(filename)
-            with loc_file.open(mode="r") as file_:
-                assert file_.read() == content
-
-
-def test_local_remote_storage_pull_missing():
-    # pulling from a non-existent remote pulls nothing gracefully
-    with make_tempdir() as d:
-        filename = "a.txt"
-        remote = RemoteStorage(d / "root", str(d / "remote"))
-        assert remote.pull(filename, command_hash="aaaa") is None
-        assert remote.pull(filename) is None
-
-
 def test_cli_find_threshold(capsys):
     def make_examples(nlp: Language) -> List[Example]:
         docs: List[Example] = []
@@ -1206,63 +974,6 @@ def test_cli_find_threshold(capsys):
                 )


-@pytest.mark.filterwarnings("ignore::DeprecationWarning")
-@pytest.mark.parametrize(
-    "reqs,output",
-    [
-        [
-            """
-            spacy
-
-            # comment
-
-            thinc""",
-            (False, False),
-        ],
-        [
-            """# comment
-            --some-flag
-            spacy""",
-            (False, False),
-        ],
-        [
-            """# comment
-            --some-flag
-            spacy; python_version >= '3.6'""",
-            (False, False),
-        ],
-        [
-            """# comment
-             spacyunknowndoesnotexist12345""",
-            (True, False),
-        ],
-    ],
-)
-def test_project_check_requirements(reqs, output):
-    import pkg_resources
-
-    # excessive guard against unlikely package name
-    try:
-        pkg_resources.require("spacyunknowndoesnotexist12345")
-    except pkg_resources.DistributionNotFound:
-        assert output == _check_requirements([req.strip() for req in reqs.split("\n")])
-
-
-def test_upload_download_local_file():
-    with make_tempdir() as d1, make_tempdir() as d2:
-        filename = "f.txt"
-        content = "content"
-        local_file = d1 / filename
-        remote_file = d2 / filename
-        with local_file.open(mode="w") as file_:
-            file_.write(content)
-        upload_file(local_file, remote_file)
-        local_file.unlink()
-        download_file(remote_file, local_file)
-        with local_file.open(mode="r") as file_:
-            assert file_.read() == content
-
-
 def test_walk_directory():
     with make_tempdir() as d:
         files = [

@@ -1,4 +1,5 @@
 import os
+import sys
 from pathlib import Path

 import pytest
@@ -213,6 +214,9 @@ def test_project_clone(options):
         assert (out / "README.md").is_file()


+@pytest.mark.skipif(
+    sys.version_info >= (3, 12), reason="Python 3.12+ not supported for remotes"
+)
 def test_project_push_pull(project_dir):
     proj = dict(SAMPLE_PROJECT)
     remote = "xyz"

@@ -1,4 +1,4 @@
-# cython: embedsignature=True, profile=True, binding=True
+# cython: embedsignature=True, binding=True
 cimport cython
 from cymem.cymem cimport Pool
 from cython.operator cimport dereference as deref

@@ -1,4 +1,4 @@
-# cython: infer_types=True, bounds_check=False, profile=True
+# cython: infer_types=True, bounds_check=False
 from cymem.cymem cimport Pool
 from libc.string cimport memset

@@ -8,6 +8,7 @@ from typing import (
     List,
     Optional,
     Protocol,
+    Sequence,
     Tuple,
     Union,
     overload,
@@ -134,7 +135,12 @@ class Doc:
     def text(self) -> str: ...
     @property
     def text_with_ws(self) -> str: ...
-    ents: Tuple[Span]
+    # Ideally the getter would output Tuple[Span]
+    # see https://github.com/python/mypy/issues/3004
+    @property
+    def ents(self) -> Sequence[Span]: ...
+    @ents.setter
+    def ents(self, value: Sequence[Span]) -> None: ...
     def set_ents(
         self,
         entities: List[Span],
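
Replacing the bare ents: Tuple[Span] attribute with a property pair in the stub matters for type checking: mypy cannot give a plain attribute different getter and setter types (hence the linked issue), while the property form lets assignments accept any Sequence[Span]. The everyday pattern the stub now type-checks cleanly:

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Apple is looking at buying U.K. startup")
doc.ents = [Span(doc, 0, 1, label="ORG")]  # setter takes a Sequence[Span]
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Apple', 'ORG')]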

@@ -1,4 +1,4 @@
-# cython: infer_types=True, bounds_check=False, profile=True
+# cython: infer_types=True, bounds_check=False
 from typing import Set

 cimport cython

@@ -1,4 +1,5 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
+# cython: profile=False
 from typing import Generator, List, Tuple

 cimport cython

@@ -1,3 +1,4 @@
+# cython: profile=False
 cimport numpy as np
 from libc.string cimport memset

@@ -1,3 +1,4 @@
+# cython: profile=False
 cimport numpy as np

 import copy

@@ -1,3 +1,4 @@
+# cython: profile=False
 import struct
 import weakref
 from copy import deepcopy

@@ -1,4 +1,5 @@
 # cython: infer_types=True
+# cython: profile=False
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np

@@ -1,3 +1,4 @@
+# cython: profile=False
 import re
 from itertools import chain
 from typing import List, Tuple

@@ -1,3 +1,4 @@
+# cython: profile=False
 from typing import List

 import numpy

@@ -63,7 +63,7 @@ def create_plain_text_reader(
     path: Optional[Path],
     min_length: int = 0,
     max_length: int = 0,
-) -> Callable[["Language"], Iterable[Doc]]:
+) -> Callable[["Language"], Iterable[Example]]:
     """Iterate Example objects from a file or directory of plain text
     UTF-8 files with one line per doc.

spacy/training/example.pyi (new file, 66 lines)
@@ -0,0 +1,66 @@
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+
+from ..tokens import Doc, Span
+from ..vocab import Vocab
+from .alignment import Alignment
+
+def annotations_to_doc(
+    vocab: Vocab,
+    tok_annot: Dict[str, Any],
+    doc_annot: Dict[str, Any],
+) -> Doc: ...
+def validate_examples(
+    examples: Iterable[Example],
+    method: str,
+) -> None: ...
+def validate_get_examples(
+    get_examples: Callable[[], Iterable[Example]],
+    method: str,
+): ...
+
+class Example:
+    x: Doc
+    y: Doc
+
+    def __init__(
+        self,
+        predicted: Doc,
+        reference: Doc,
+        *,
+        alignment: Optional[Alignment] = None,
+    ): ...
+    def __len__(self) -> int: ...
+    @property
+    def predicted(self) -> Doc: ...
+    @predicted.setter
+    def predicted(self, doc: Doc) -> None: ...
+    @property
+    def reference(self) -> Doc: ...
+    @reference.setter
+    def reference(self, doc: Doc) -> None: ...
+    def copy(self) -> Example: ...
+    @classmethod
+    def from_dict(cls, predicted: Doc, example_dict: Dict[str, Any]) -> Example: ...
+    @property
+    def alignment(self) -> Alignment: ...
+    def get_aligned(self, field: str, as_string=False): ...
+    def get_aligned_parse(self, projectivize=True): ...
+    def get_aligned_sent_starts(self): ...
+    def get_aligned_spans_x2y(
+        self, x_spans: Iterable[Span], allow_overlap=False
+    ) -> List[Span]: ...
+    def get_aligned_spans_y2x(
+        self, y_spans: Iterable[Span], allow_overlap=False
+    ) -> List[Span]: ...
+    def get_aligned_ents_and_ner(self) -> Tuple[List[Span], List[str]]: ...
+    def get_aligned_ner(self) -> List[str]: ...
+    def get_matching_ents(self, check_label: bool = True) -> List[Span]: ...
+    def to_dict(self) -> Dict[str, Any]: ...
+    def split_sents(self) -> List[Example]: ...
+    @property
+    def text(self) -> str: ...
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+
+def _parse_example_dict_data(example_dict): ...
+def _fix_legacy_dict_data(example_dict): ...
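
Example is implemented in Cython, so type checkers previously saw it as an opaque Any; the new stub file above gives them real signatures without touching the compiled code. A short usage sketch covered by the stub (the annotations are toy values):

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp("Apple is looking at buying U.K. startup")
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})
print(example.reference.ents)  # (Apple,) on the gold side
print(example.predicted.ents)  # () until a pipeline sets entities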

@@ -1,3 +1,4 @@
+# cython: profile=False
 from collections.abc import Iterable as IterableInstance

 import numpy

@@ -1,3 +1,4 @@
+# cython: profile=False
 import warnings

 import srsly

@@ -302,7 +302,7 @@ def read_vectors(
             shape = (truncate_vectors, shape[1])
     vectors_data = numpy.zeros(shape=shape, dtype="f")
     vectors_keys = []
-    for i, line in enumerate(tqdm.tqdm(f)):
+    for i, line in enumerate(tqdm.tqdm(f, disable=None)):
         line = line.rstrip()
         pieces = line.rsplit(" ", vectors_data.shape[1])
         word = pieces.pop(0)
| 
						 | 
					
 | 
				
			||||||
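Note on the change above: `disable=None` makes tqdm show the progress bar only when output is attached to a terminal, so piped or logged runs (e.g. of `spacy init vectors`, which reads vector files this way) stay clean. A minimal illustration:

```python
import io

import tqdm

buf = io.StringIO()  # not a TTY, so the bar is auto-disabled
for _ in tqdm.tqdm(range(3), disable=None, file=buf):
    pass
print(repr(buf.getvalue()))  # '' -- no progress bar was written
```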
@ -0,0 +1 @@
+# cython: profile=False
@ -101,7 +101,6 @@ logger.addHandler(logger_stream_handler)

 class ENV_VARS:
     CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
-    PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"


 class registry(thinc.registry):
@ -119,6 +118,7 @@ class registry(thinc.registry):
     augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
     loggers = catalogue.create("spacy", "loggers", entry_points=True)
     scorers = catalogue.create("spacy", "scorers", entry_points=True)
+    vectors = catalogue.create("spacy", "vectors", entry_points=True)
     # These are factories registered via third-party packages and the
     # spacy_factories entry point. This registry only exists so we can easily
     # load them via the entry points. The "true" factories are added via the
@ -974,23 +974,12 @@ def replace_model_node(model: Model, target: Model, replacement: Model) -> None:

 def split_command(command: str) -> List[str]:
     """Split a string command using shlex. Handles platform compatibility.

     command (str) : The command to split
     RETURNS (List[str]): The split command.
     """
     return shlex.split(command, posix=not is_windows)


-def join_command(command: List[str]) -> str:
-    """Join a command using shlex. shlex.join is only available for Python 3.8+,
-    so we're using a workaround here.
-
-    command (List[str]): The command to join.
-    RETURNS (str): The joined command
-    """
-    return " ".join(shlex.quote(cmd) for cmd in command)
-
-
 def run_command(
     command: Union[str, List[str]],
     *,
@ -999,7 +988,6 @@ def run_command(
 ) -> subprocess.CompletedProcess:
     """Run a command on the command line as a subprocess. If the subprocess
     returns a non-zero exit code, a system exit is performed.
-
     command (str / List[str]): The command. If provided as a string, the
         string will be split using shlex.split.
     stdin (Optional[Any]): stdin to read from or None.
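A quick sketch of the helpers that remain after this hunk (`join_command` was dropped above; `shlex.join` covers the same ground on modern Python):

```python
from spacy.util import run_command, split_command

# split_command handles POSIX vs. Windows quoting rules.
print(split_command('python -c "print(1)"'))  # ['python', '-c', 'print(1)']

# run_command performs a system exit if the subprocess returns non-zero.
run_command("python --version")
```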
@ -1050,7 +1038,6 @@ def run_command(
 @contextmanager
 def working_dir(path: Union[str, Path]) -> Iterator[Path]:
     """Change current working directory and returns to previous on exit.
-
     path (str / Path): The directory to navigate to.
     YIELDS (Path): The absolute path to the current working directory. This
         should be used if the block needs to perform actions within the working
@ -1069,7 +1056,6 @@ def working_dir(path: Union[str, Path]) -> Iterator[Path]:
 def make_tempdir() -> Generator[Path, None, None]:
     """Execute a block in a temporary directory and remove the directory and
     its contents at the end of the with block.
-
     YIELDS (Path): The path of the temp directory.
     """
     d = Path(tempfile.mkdtemp())
@ -1082,20 +1068,14 @@ def make_tempdir() -> Generator[Path, None, None]:
         rmfunc(path)

     try:
-        shutil.rmtree(str(d), onerror=force_remove)
+        if sys.version_info >= (3, 12):
+            shutil.rmtree(str(d), onexc=force_remove)
+        else:
+            shutil.rmtree(str(d), onerror=force_remove)
     except PermissionError as e:
         warnings.warn(Warnings.W091.format(dir=d, msg=e))


-def is_cwd(path: Union[Path, str]) -> bool:
-    """Check whether a path is the current working directory.
-
-    path (Union[Path, str]): The directory path.
-    RETURNS (bool): Whether the path is the current working directory.
-    """
-    return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
-
-
 def is_in_jupyter() -> bool:
     """Check if user is running spaCy from a Jupyter notebook by detecting the
     IPython kernel. Mainly used for the displaCy visualizer.
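How the two context managers above compose, as a short sketch (on Python 3.12+ the cleanup now goes through `shutil.rmtree`'s `onexc` hook instead of the deprecated `onerror`):

```python
from spacy.util import make_tempdir, working_dir

with make_tempdir() as tmp:           # tmp is a pathlib.Path
    (tmp / "hello.txt").write_text("hi")
    with working_dir(tmp) as cwd:     # chdir in; previous cwd restored on exit
        print(cwd.name == tmp.name)   # True
# the temporary directory has been removed at this point
```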
@ -1,3 +1,6 @@
+# cython: infer_types=True, binding=True
+from typing import Callable
+
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.set cimport set as cppset

@ -5,7 +8,8 @@ from murmurhash.mrmr cimport hash128_x64

 import warnings
 from enum import Enum
-from typing import cast
+from pathlib import Path
+from typing import TYPE_CHECKING, Union, cast

 import numpy
 import srsly

@ -21,6 +25,9 @@ from .attrs import IDS
 from .errors import Errors, Warnings
 from .strings import get_string_id

+if TYPE_CHECKING:
+    from .vocab import Vocab  # noqa: F401  # no-cython-lint
+

 def unpickle_vectors(bytes_data):
     return Vectors().from_bytes(bytes_data)

@ -35,7 +42,71 @@ class Mode(str, Enum):
         return list(cls.__members__.keys())


-cdef class Vectors:
+cdef class BaseVectors:
+    def __init__(self, *, strings=None):
+        # Make sure abstract BaseVectors is not instantiated.
+        if self.__class__ == BaseVectors:
+            raise TypeError(
+                Errors.E1046.format(cls_name=self.__class__.__name__)
+            )
+
+    def __getitem__(self, key):
+        raise NotImplementedError
+
+    def __contains__(self, key):
+        raise NotImplementedError
+
+    def is_full(self):
+        raise NotImplementedError
+
+    def get_batch(self, keys):
+        raise NotImplementedError
+
+    @property
+    def shape(self):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
+
+    @property
+    def vectors_length(self):
+        raise NotImplementedError
+
+    @property
+    def size(self):
+        raise NotImplementedError
+
+    def add(self, key, *, vector=None):
+        raise NotImplementedError
+
+    def to_ops(self, ops: Ops):
+        pass
+
+    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
+    # allow serialization
+    def to_bytes(self, **kwargs):
+        return b""
+
+    def from_bytes(self, data: bytes, **kwargs):
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs):
+        return None
+
+    def from_disk(self, path: Union[str, Path], **kwargs):
+        return self
+
+
+@util.registry.vectors("spacy.Vectors.v1")
+def create_mode_vectors() -> Callable[["Vocab"], BaseVectors]:
+    def vectors_factory(vocab: "Vocab") -> BaseVectors:
+        return Vectors(strings=vocab.strings)
+
+    return vectors_factory
+
+
+cdef class Vectors(BaseVectors):
     """Store, save and load word vectors.

     Vectors data is kept in the vectors.data attribute, which should be an
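For orientation, a hedged sketch of how a third-party package could plug into the new `vectors` registry, mirroring the `spacy.Vectors.v1` factory above. `my_package.MyVectors.v1` is a hypothetical name; a real implementation would return its own `BaseVectors` subclass rather than the default `Vectors`:

```python
from spacy.util import registry
from spacy.vectors import Vectors


@registry.vectors("my_package.MyVectors.v1")
def create_my_vectors():
    def vectors_factory(vocab):
        # Stand-in: a custom BaseVectors subclass would be constructed here.
        return Vectors(strings=vocab.strings)

    return vectors_factory
```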
@ -1,4 +1,3 @@
-# cython: profile=True
 import functools

 import numpy
@ -94,6 +93,7 @@ cdef class Vocab:
             return self._vectors

         def __set__(self, vectors):
+            if hasattr(vectors, "strings"):
                 for s in vectors.strings:
                     self.strings.add(s)
             self._vectors = vectors

@ -193,7 +193,7 @@ cdef class Vocab:
         lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
         lex.orth = self.strings.add(string)
         lex.length = len(string)
-        if self.vectors is not None:
+        if self.vectors is not None and hasattr(self.vectors, "key2row"):
             lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
         else:
             lex.id = OOV_RANK
@ -289,12 +289,17 @@ cdef class Vocab:

     @property
     def vectors_length(self):
+        if hasattr(self.vectors, "shape"):
             return self.vectors.shape[1]
+        else:
+            return -1

     def reset_vectors(self, *, width=None, shape=None):
         """Drop the current vector table. Because all vectors must be the same
         width, you have to call this to change the size of the vectors.
         """
+        if not isinstance(self.vectors, Vectors):
+            raise ValueError(Errors.E849.format(method="reset_vectors", vectors_type=type(self.vectors)))
         if width is not None and shape is not None:
             raise ValueError(Errors.E065.format(width=width, shape=shape))
         elif shape is not None:
@ -304,6 +309,8 @@ cdef class Vocab:
             self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))

     def deduplicate_vectors(self):
+        if not isinstance(self.vectors, Vectors):
+            raise ValueError(Errors.E849.format(method="deduplicate_vectors", vectors_type=type(self.vectors)))
         if self.vectors.mode != VectorsMode.default:
             raise ValueError(Errors.E858.format(
                 mode=self.vectors.mode,

@ -357,6 +364,8 @@ cdef class Vocab:

         DOCS: https://spacy.io/api/vocab#prune_vectors
         """
+        if not isinstance(self.vectors, Vectors):
+            raise ValueError(Errors.E849.format(method="prune_vectors", vectors_type=type(self.vectors)))
         if self.vectors.mode != VectorsMode.default:
             raise ValueError(Errors.E858.format(
                 mode=self.vectors.mode,
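The `isinstance` guards added above make these table-management helpers explicit about only supporting the default `Vectors` implementation; with a custom `BaseVectors` subclass they now raise `E849` instead of failing in an undefined way. A small sketch of the happy path:

```python
import spacy

nlp = spacy.blank("en")
# vocab.vectors is the default Vectors implementation, so this is allowed;
# a custom BaseVectors subclass here would raise ValueError (E849).
nlp.vocab.deduplicate_vectors()
```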
@ -481,6 +481,286 @@ The other arguments are shared between all versions.

</Accordion>

## Curated Transformer architectures {id="curated-trf",source="https://github.com/explosion/spacy-curated-transformers/blob/main/spacy_curated_transformers/models/architectures.py"}

The following architectures are provided by the package
[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers).
See the [usage documentation](/usage/embeddings-transformers#transformers) for
how to integrate the architectures into your training config.

When loading the model
[from the Hugging Face Hub](/api/curatedtransformer#hf_trfencoder_loader), the
model config's parameters must be the same as the hyperparameters used by the
pre-trained model. The
[`init fill-curated-transformer`](/api/cli#init-fill-curated-transformer) CLI
command can be used to automatically fill in these values.
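As a hedged sketch, a training-config block using one of the architectures below could look roughly like this; the component factory name and the omitted model hyperparameters are assumptions here, to be filled in from the checkpoint via `init fill-curated-transformer`:

```ini
[components.transformer]
factory = "curated_transformer"

[components.transformer.model]
@architectures = "spacy-curated-transformers.XlmrTransformer.v1"
# remaining hyperparameters filled in with `init fill-curated-transformer`
```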
### spacy-curated-transformers.AlbertTransformer.v1

Construct an ALBERT transformer model.

| Name                           | Description                                                                               |
| ------------------------------ | ----------------------------------------------------------------------------------------- |
| `vocab_size`                   | Vocabulary size. ~~int~~                                                                  |
| `with_spans`                   | Callback that constructs a span generator model. ~~Callable~~                             |
| `piece_encoder`                | The piece encoder to segment input tokens. ~~Model~~                                      |
| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~                               |
| `embedding_width`              | Width of the embedding representations. ~~int~~                                           |
| `hidden_act`                   | Activation used by the point-wise feed-forward layers. ~~str~~                            |
| `hidden_dropout_prob`          | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~        |
| `hidden_width`                 | Width of the final representations. ~~int~~                                               |
| `intermediate_width`           | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~  |
| `layer_norm_eps`               | Epsilon for layer normalization. ~~float~~                                                |
| `max_position_embeddings`      | Maximum length of position embeddings. ~~int~~                                            |
| `model_max_length`             | Maximum length of model inputs. ~~int~~                                                   |
| `num_attention_heads`          | Number of self-attention heads. ~~int~~                                                   |
| `num_hidden_groups`            | Number of layer groups whose constituents share parameters. ~~int~~                       |
| `num_hidden_layers`            | Number of hidden layers. ~~int~~                                                          |
| `padding_idx`                  | Index of the padding meta-token. ~~int~~                                                  |
| `type_vocab_size`              | Type vocabulary size. ~~int~~                                                             |
| `mixed_precision`              | Use mixed-precision training. ~~bool~~                                                    |
| `grad_scaler_config`           | Configuration passed to the PyTorch gradient scaler. ~~dict~~                             |
| **CREATES**                    | The model using the architecture. ~~Model~~                                               |

### spacy-curated-transformers.BertTransformer.v1

Construct a BERT transformer model.

| Name                           | Description                                                                               |
| ------------------------------ | ----------------------------------------------------------------------------------------- |
| `vocab_size`                   | Vocabulary size. ~~int~~                                                                  |
| `with_spans`                   | Callback that constructs a span generator model. ~~Callable~~                             |
| `piece_encoder`                | The piece encoder to segment input tokens. ~~Model~~                                      |
| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~                               |
| `hidden_act`                   | Activation used by the point-wise feed-forward layers. ~~str~~                            |
| `hidden_dropout_prob`          | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~        |
| `hidden_width`                 | Width of the final representations. ~~int~~                                               |
| `intermediate_width`           | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~  |
| `layer_norm_eps`               | Epsilon for layer normalization. ~~float~~                                                |
| `max_position_embeddings`      | Maximum length of position embeddings. ~~int~~                                            |
| `model_max_length`             | Maximum length of model inputs. ~~int~~                                                   |
| `num_attention_heads`          | Number of self-attention heads. ~~int~~                                                   |
| `num_hidden_layers`            | Number of hidden layers. ~~int~~                                                          |
| `padding_idx`                  | Index of the padding meta-token. ~~int~~                                                  |
| `type_vocab_size`              | Type vocabulary size. ~~int~~                                                             |
| `mixed_precision`              | Use mixed-precision training. ~~bool~~                                                    |
| `grad_scaler_config`           | Configuration passed to the PyTorch gradient scaler. ~~dict~~                             |
| **CREATES**                    | The model using the architecture. ~~Model~~                                               |

### spacy-curated-transformers.CamembertTransformer.v1

Construct a CamemBERT transformer model.

| Name                           | Description                                                                               |
| ------------------------------ | ----------------------------------------------------------------------------------------- |
| `vocab_size`                   | Vocabulary size. ~~int~~                                                                  |
| `with_spans`                   | Callback that constructs a span generator model. ~~Callable~~                             |
| `piece_encoder`                | The piece encoder to segment input tokens. ~~Model~~                                      |
| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~                               |
| `hidden_act`                   | Activation used by the point-wise feed-forward layers. ~~str~~                            |
| `hidden_dropout_prob`          | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~        |
| `hidden_width`                 | Width of the final representations. ~~int~~                                               |
| `intermediate_width`           | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~  |
| `layer_norm_eps`               | Epsilon for layer normalization. ~~float~~                                                |
| `max_position_embeddings`      | Maximum length of position embeddings. ~~int~~                                            |
| `model_max_length`             | Maximum length of model inputs. ~~int~~                                                   |
| `num_attention_heads`          | Number of self-attention heads. ~~int~~                                                   |
| `num_hidden_layers`            | Number of hidden layers. ~~int~~                                                          |
| `padding_idx`                  | Index of the padding meta-token. ~~int~~                                                  |
| `type_vocab_size`              | Type vocabulary size. ~~int~~                                                             |
| `mixed_precision`              | Use mixed-precision training. ~~bool~~                                                    |
| `grad_scaler_config`           | Configuration passed to the PyTorch gradient scaler. ~~dict~~                             |
| **CREATES**                    | The model using the architecture. ~~Model~~                                               |

### spacy-curated-transformers.RobertaTransformer.v1

Construct a RoBERTa transformer model.

| Name                           | Description                                                                               |
| ------------------------------ | ----------------------------------------------------------------------------------------- |
| `vocab_size`                   | Vocabulary size. ~~int~~                                                                  |
| `with_spans`                   | Callback that constructs a span generator model. ~~Callable~~                             |
| `piece_encoder`                | The piece encoder to segment input tokens. ~~Model~~                                      |
| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~                               |
| `hidden_act`                   | Activation used by the point-wise feed-forward layers. ~~str~~                            |
| `hidden_dropout_prob`          | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~        |
| `hidden_width`                 | Width of the final representations. ~~int~~                                               |
| `intermediate_width`           | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~  |
| `layer_norm_eps`               | Epsilon for layer normalization. ~~float~~                                                |
| `max_position_embeddings`      | Maximum length of position embeddings. ~~int~~                                            |
| `model_max_length`             | Maximum length of model inputs. ~~int~~                                                   |
| `num_attention_heads`          | Number of self-attention heads. ~~int~~                                                   |
| `num_hidden_layers`            | Number of hidden layers. ~~int~~                                                          |
| `padding_idx`                  | Index of the padding meta-token. ~~int~~                                                  |
| `type_vocab_size`              | Type vocabulary size. ~~int~~                                                             |
| `mixed_precision`              | Use mixed-precision training. ~~bool~~                                                    |
| `grad_scaler_config`           | Configuration passed to the PyTorch gradient scaler. ~~dict~~                             |
| **CREATES**                    | The model using the architecture. ~~Model~~                                               |

### spacy-curated-transformers.XlmrTransformer.v1

Construct an XLM-RoBERTa transformer model.

| Name                           | Description                                                                               |
| ------------------------------ | ----------------------------------------------------------------------------------------- |
| `vocab_size`                   | Vocabulary size. ~~int~~                                                                  |
| `with_spans`                   | Callback that constructs a span generator model. ~~Callable~~                             |
| `piece_encoder`                | The piece encoder to segment input tokens. ~~Model~~                                      |
| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~                               |
| `hidden_act`                   | Activation used by the point-wise feed-forward layers. ~~str~~                            |
| `hidden_dropout_prob`          | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~        |
| `hidden_width`                 | Width of the final representations. ~~int~~                                               |
| `intermediate_width`           | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~  |
| `layer_norm_eps`               | Epsilon for layer normalization. ~~float~~                                                |
| `max_position_embeddings`      | Maximum length of position embeddings. ~~int~~                                            |
| `model_max_length`             | Maximum length of model inputs. ~~int~~                                                   |
| `num_attention_heads`          | Number of self-attention heads. ~~int~~                                                   |
| `num_hidden_layers`            | Number of hidden layers. ~~int~~                                                          |
| `padding_idx`                  | Index of the padding meta-token. ~~int~~                                                  |
| `type_vocab_size`              | Type vocabulary size. ~~int~~                                                             |
| `mixed_precision`              | Use mixed-precision training. ~~bool~~                                                    |
| `grad_scaler_config`           | Configuration passed to the PyTorch gradient scaler. ~~dict~~                             |
| **CREATES**                    | The model using the architecture. ~~Model~~                                               |

### spacy-curated-transformers.ScalarWeight.v1

Construct a model that accepts a list of transformer layer outputs and returns a
weighted representation of the same.

| Name                 | Description                                                                    |
| -------------------- | ------------------------------------------------------------------------------ |
| `num_layers`         | Number of transformer hidden layers. ~~int~~                                   |
| `dropout_prob`       | Dropout probability. ~~float~~                                                 |
| `mixed_precision`    | Use mixed-precision training. ~~bool~~                                         |
| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~                  |
| **CREATES**          | The model using the architecture. ~~Model[ScalarWeightInT, ScalarWeightOutT]~~ |

### spacy-curated-transformers.TransformerLayersListener.v1

Construct a listener layer that communicates with one or more upstream
Transformer components. This layer extracts the output of the last transformer
layer and performs pooling over the individual pieces of each `Doc` token,
returning their corresponding representations. The upstream name should either
be the wildcard string '\*', or the name of the Transformer component.

In almost all cases, the wildcard string will suffice as there'll only be one
upstream Transformer component. But in certain situations, e.g. when you have
disjoint datasets for certain tasks, or you'd like to use a pre-trained pipeline
but a downstream task requires its own token representations, you could end up
with more than one Transformer component in the pipeline.

| Name            | Description                                                                                                              |
| --------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `layers`        | The number of layers produced by the upstream transformer component, excluding the embedding layer. ~~int~~              |
| `width`         | The width of the vectors produced by the upstream transformer component. ~~int~~                                         |
| `pooling`       | Model that is used to perform pooling over the piece representations. ~~Model~~                                          |
| `upstream_name` | A string to identify the 'upstream' Transformer component to communicate with. ~~str~~                                   |
| `grad_factor`   | Factor to multiply gradients with. ~~float~~                                                                              |
| **CREATES**     | A model that returns the relevant vectors from an upstream transformer component. ~~Model[List[Doc], List[Floats2d]]~~   |

### spacy-curated-transformers.LastTransformerLayerListener.v1

Construct a listener layer that communicates with one or more upstream
Transformer components. This layer extracts the output of the last transformer
layer and performs pooling over the individual pieces of each `Doc` token,
returning their corresponding representations. The upstream name should either
be the wildcard string '\*', or the name of the Transformer component.

In almost all cases, the wildcard string will suffice as there'll only be one
upstream Transformer component. But in certain situations, e.g. when you have
disjoint datasets for certain tasks, or you'd like to use a pre-trained pipeline
but a downstream task requires its own token representations, you could end up
with more than one Transformer component in the pipeline.

| Name            | Description                                                                                                              |
| --------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `width`         | The width of the vectors produced by the upstream transformer component. ~~int~~                                         |
| `pooling`       | Model that is used to perform pooling over the piece representations. ~~Model~~                                          |
| `upstream_name` | A string to identify the 'upstream' Transformer component to communicate with. ~~str~~                                   |
| `grad_factor`   | Factor to multiply gradients with. ~~float~~                                                                              |
| **CREATES**     | A model that returns the relevant vectors from an upstream transformer component. ~~Model[List[Doc], List[Floats2d]]~~   |

### spacy-curated-transformers.ScalarWeightingListener.v1

Construct a listener layer that communicates with one or more upstream
Transformer components. This layer calculates a weighted representation of all
transformer layer outputs and performs pooling over the individual pieces of
each `Doc` token, returning their corresponding representations.

Requires its upstream Transformer components to return all layer outputs from
their models. The upstream name should either be the wildcard string '\*', or
the name of the Transformer component.

In almost all cases, the wildcard string will suffice as there'll only be one
upstream Transformer component. But in certain situations, e.g. when you have
disjoint datasets for certain tasks, or you'd like to use a pre-trained pipeline
but a downstream task requires its own token representations, you could end up
with more than one Transformer component in the pipeline.

| Name            | Description                                                                                                              |
| --------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `width`         | The width of the vectors produced by the upstream transformer component. ~~int~~                                         |
| `weighting`     | Model that is used to perform the weighting of the different layer outputs. ~~Model~~                                    |
| `pooling`       | Model that is used to perform pooling over the piece representations. ~~Model~~                                          |
| `upstream_name` | A string to identify the 'upstream' Transformer component to communicate with. ~~str~~                                   |
| `grad_factor`   | Factor to multiply gradients with. ~~float~~                                                                              |
| **CREATES**     | A model that returns the relevant vectors from an upstream transformer component. ~~Model[List[Doc], List[Floats2d]]~~   |

### spacy-curated-transformers.BertWordpieceEncoder.v1

Construct a WordPiece piece encoder model that accepts a list of token sequences
or documents and returns a corresponding list of piece identifiers. This encoder
also splits each token on punctuation characters, as expected by most BERT
models.

This model must be separately initialized using an appropriate loader.

### spacy-curated-transformers.ByteBpeEncoder.v1

Construct a Byte-BPE piece encoder model that accepts a list of token sequences
or documents and returns a corresponding list of piece identifiers.

This model must be separately initialized using an appropriate loader.

### spacy-curated-transformers.CamembertSentencepieceEncoder.v1

Construct a SentencePiece piece encoder model that accepts a list of token
sequences or documents and returns a corresponding list of piece identifiers
with CamemBERT post-processing applied.

This model must be separately initialized using an appropriate loader.

### spacy-curated-transformers.CharEncoder.v1

Construct a character piece encoder model that accepts a list of token sequences
or documents and returns a corresponding list of piece identifiers.

This model must be separately initialized using an appropriate loader.

### spacy-curated-transformers.SentencepieceEncoder.v1

Construct a SentencePiece piece encoder model that accepts a list of token
sequences or documents and returns a corresponding list of piece identifiers.

This model must be separately initialized using an appropriate loader.

### spacy-curated-transformers.WordpieceEncoder.v1

Construct a WordPiece piece encoder model that accepts a list of token sequences
or documents and returns a corresponding list of piece identifiers. This encoder
also splits each token on punctuation characters, as expected by most BERT
models.

This model must be separately initialized using an appropriate loader.

### spacy-curated-transformers.XlmrSentencepieceEncoder.v1

Construct a SentencePiece piece encoder model that accepts a list of token
sequences or documents and returns a corresponding list of piece identifiers
with XLM-RoBERTa post-processing applied.

This model must be separately initialized using an appropriate loader.

## Pretraining architectures {id="pretrain",source="spacy/ml/models/multi_task.py"}

The spacy `pretrain` command lets you initialize a `Tok2Vec` layer in your
							
								
								
									
website/docs/api/basevectors.mdx (new file, 143 lines)
@ -0,0 +1,143 @@
---
title: BaseVectors
teaser: Abstract class for word vectors
tag: class
source: spacy/vectors.pyx
version: 3.7
---

`BaseVectors` is an abstract class to support the development of custom vectors
implementations.

For use in training with [`StaticVectors`](/api/architectures#staticvectors),
`get_batch` must be implemented. For improved performance, use efficient
batching in `get_batch` and implement `to_ops` to copy the vector data to the
current device. See an example custom implementation for
[BPEmb subword embeddings](/usage/embeddings-transformers#custom-vectors).
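A minimal sketch of what a custom implementation can look like (all names are hypothetical, and only the hooks discussed above are shown):

```python
from spacy.vectors import BaseVectors


class MyVectors(BaseVectors):
    def __init__(self, *, strings=None, data=None):
        super().__init__(strings=strings)
        self.data = data  # 2d array of vectors, e.g. allocated via Thinc ops

    def __len__(self):
        return 0 if self.data is None else self.data.shape[0]

    def get_batch(self, keys):
        # Required for training with StaticVectors: look up all keys as one
        # efficient batched operation over self.data.
        raise NotImplementedError

    def to_ops(self, ops):
        if self.data is not None:
            self.data = ops.asarray(self.data)  # move data to the new device
```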
 | 
					## BaseVectors.\_\_init\_\_ {id="init",tag="method"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Create a new vector store.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Name           | Description                                                                                                           |
 | 
				
			||||||
 | 
					| -------------- | --------------------------------------------------------------------------------------------------------------------- |
 | 
				
			||||||
 | 
					| _keyword-only_ |                                                                                                                       |
 | 
				
			||||||
 | 
					| `strings`      | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## BaseVectors.\_\_getitem\_\_ {id="getitem",tag="method"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Get a vector by key. If the key is not found in the table, a `KeyError` should
 | 
				
			||||||
 | 
					be raised.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Name        | Description                                                      |
 | 
				
			||||||
 | 
					| ----------- | ---------------------------------------------------------------- |
 | 
				
			||||||
 | 
					| `key`       | The key to get the vector for. ~~Union[int, str]~~               |
 | 
				
			||||||
 | 
					| **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## BaseVectors.\_\_len\_\_ {id="len",tag="method"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Return the number of vectors in the table.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Name        | Description                                 |
 | 
				
			||||||
 | 
					| ----------- | ------------------------------------------- |
 | 
				
			||||||
 | 
					| **RETURNS** | The number of vectors in the table. ~~int~~ |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## BaseVectors.\_\_contains\_\_ {id="contains",tag="method"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Check whether there is a vector entry for the given key.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Name        | Description                                  |
 | 
				
			||||||
 | 
					| ----------- | -------------------------------------------- |
 | 
				
			||||||
 | 
					| `key`       | The key to check. ~~int~~                    |
 | 
				
			||||||
 | 
					| **RETURNS** | Whether the key has a vector entry. ~~bool~~ |

## BaseVectors.add {id="add",tag="method"}

Add a key to the table, if possible. If no keys can be added, return `-1`.

| Name        | Description                                                                          |
| ----------- | ------------------------------------------------------------------------------------ |
| `key`       | The key to add. ~~Union[str, int]~~                                                  |
| **RETURNS** | The row the vector was added to, or `-1` if the operation is not supported. ~~int~~  |

## BaseVectors.shape {id="shape",tag="property"}

Get the `(rows, dims)` tuple of the number of rows and the number of dimensions
in the vector table.

| Name        | Description                                |
| ----------- | ------------------------------------------ |
| **RETURNS** | A `(rows, dims)` pair. ~~Tuple[int, int]~~ |

## BaseVectors.size {id="size",tag="property"}

The vector size, i.e. `rows * dims`.

| Name        | Description              |
| ----------- | ------------------------ |
| **RETURNS** | The vector size. ~~int~~ |

## BaseVectors.is_full {id="is_full",tag="property"}

Whether the vectors table is full and no slots are available for new keys.

| Name        | Description                                 |
| ----------- | ------------------------------------------- |
| **RETURNS** | Whether the vectors table is full. ~~bool~~ |

## BaseVectors.get_batch {id="get_batch",tag="method",version="3.2"}

Get the vectors for the provided keys efficiently as a batch. Required to use
the vectors with [`StaticVectors`](/api/architectures#StaticVectors) for
training.

| Name   | Description                             |
| ------ | --------------------------------------- |
| `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
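
A sketch of the expected usage, assuming a concrete implementation `vectors`
that returns one row per key (the exact return type is left to the
implementation):

```python
# Fetch several vectors in one call instead of looping over __getitem__,
# which is what StaticVectors relies on during training.
batch = vectors.get_batch(["apple", "pear"])
```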

## BaseVectors.to_ops {id="to_ops",tag="method"}

Dummy method. Implement this to change the embedding matrix to use different
Thinc ops.

| Name  | Description                                              |
| ----- | -------------------------------------------------------- |
| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |

## BaseVectors.to_disk {id="to_disk",tag="method"}

Dummy method to allow serialization. Implement to save vector data with the
pipeline.

| Name   | Description                                                                                                                                |
| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |

## BaseVectors.from_disk {id="from_disk",tag="method"}

Dummy method to allow serialization. Implement to load vector data from a saved
pipeline.

| Name        | Description                                                                                      |
| ----------- | ------------------------------------------------------------------------------------------------ |
| `path`      | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| **RETURNS** | The modified vectors object. ~~BaseVectors~~                                                     |

## BaseVectors.to_bytes {id="to_bytes",tag="method"}

Dummy method to allow serialization. Implement to serialize vector data to a
binary string.

| Name        | Description                                          |
| ----------- | ---------------------------------------------------- |
| **RETURNS** | The serialized form of the vectors object. ~~bytes~~ |

## BaseVectors.from_bytes {id="from_bytes",tag="method"}

Dummy method to allow serialization. Implement to load vector data from a
binary string.

| Name        | Description                         |
| ----------- | ----------------------------------- |
| `data`      | The data to load from. ~~bytes~~    |
| **RETURNS** | The vectors object. ~~BaseVectors~~ |
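
Once a subclass implements both methods, a round trip might look like this
(sketch; `vectors` is any concrete `BaseVectors` implementation):

```python
data = vectors.to_bytes()           # serialize the vector data
vectors = vectors.from_bytes(data)  # restore it; returns the vectors object
```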

@@ -186,6 +186,29 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]

| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
| **CREATES**            | Complete and auto-filled config file for training.                                                                                                                                   |

### init fill-curated-transformer {id="init-fill-curated-transformer",version="3.7",tag="command"}

Auto-fill the Hugging Face model hyperparameters and loader parameters of a
[Curated Transformer](/api/curatedtransformer) pipeline component in a
[.cfg file](/usage/training#config). The name and revision of the
[Hugging Face model](https://huggingface.co/models) can either be passed as
command-line arguments or read from the
`initialize.components.transformer.encoder_loader` config section.

```bash
$ python -m spacy init fill-curated-transformer [base_path] [output_file] [--model-name] [--model-revision] [--pipe-name] [--code]
```

| Name                     | Description                                                                                                                                                                          |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `base_path`              | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~                                                            |
| `output_file`            | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~                                                   |
| `--model-name`, `-m`     | Name of the Hugging Face model. Defaults to the model name from the encoder loader config. ~~Optional[str] (option)~~                                                                |
| `--model-revision`, `-r` | Revision of the Hugging Face model. Defaults to `main`. ~~Optional[str] (option)~~                                                                                                   |
| `--pipe-name`, `-n`      | Name of the Curated Transformer pipe whose config is to be filled. Defaults to the first transformer pipe. ~~Optional[str] (option)~~                                                |
| `--code`, `-c`           | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| **CREATES**              | Complete and auto-filled config file for training.                                                                                                                                   |

### init vectors {id="init-vectors",version="3",tag="command"}

Convert [word vectors](/usage/linguistic-features#vectors-similarity) for use

@@ -1041,6 +1064,42 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P

| overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
| **PRINTS**              | Debugging information.                                                                                                                                                                                             |

### debug pieces {id="debug-pieces",version="3.7",tag="command"}

Analyze word- or sentencepiece stats.

```bash
$ python -m spacy debug pieces [config_path] [--code] [--name] [overrides]
```

| Name           | Description                                                                                                                                                                                |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path`  | Path to config file. ~~Union[Path, str] (positional)~~                                                                                                                                     |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~       |
| `--name`, `-n` | Name of the Curated Transformer pipe whose piece statistics are to be analyzed. Defaults to the first transformer pipe. ~~Optional[str] (option)~~                                         |
| overrides      | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS**     | Debugging information.                                                                                                                                                                     |

<Accordion title="Example outputs" spaced>

```bash
$ python -m spacy debug pieces ./config.cfg
```

```
========================= Training corpus statistics =========================
Median token length: 1.0
Mean token length: 1.54
Token length range: [1, 13]

======================= Development corpus statistics =======================
Median token length: 1.0
Mean token length: 1.44
Token length range: [1, 8]
```

</Accordion>

## train {id="train",tag="command"}

Train a pipeline. Expects data in spaCy's

@@ -1183,7 +1242,7 @@ skew. To render a sample of dependency parses in a HTML file using the

`--displacy-path` argument.

```bash
$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] [--per-component] [--spans-key]
```

| Name                                                 | Description                                                                                                                                                                          |

@@ -1197,6 +1256,7 @@ $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--

| `--displacy-path`, `-dp`                             | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~                                                           |
| `--displacy-limit`, `-dl`                            | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~            |
| `--per-component`, `-P` <Tag variant="new">3.6</Tag> | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~                                                                                           |
| `--spans-key`, `-sk` <Tag variant="new">3.6.2</Tag>  | Spans key to use when evaluating `Doc.spans`. Defaults to `sc`. ~~str (option)~~                                                                                                     |
| `--help`, `-h`                                       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
| **CREATES**                                          | Training results and optional metrics and visualizations.                                                                                                                            |

@@ -1484,9 +1544,9 @@ obsolete files is left up to you.

Remotes can be defined in the `remotes` section of the
[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses
[`cloudpathlib`](https://cloudpathlib.drivendata.org) to communicate with the
remote storages, so you can use any protocol that `cloudpathlib` supports,
including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), and the local
filesystem, although you may need to install extra dependencies to use certain
protocols.

website/docs/api/curatedtransformer.mdx (new file, 572 lines)

@@ -0,0 +1,572 @@

---
title: CuratedTransformer
teaser:
  Pipeline component for multi-task learning with Curated Transformer models
tag: class
source: github.com/explosion/spacy-curated-transformers/blob/main/spacy_curated_transformers/pipeline/transformer.py
version: 3.7
api_base_class: /api/pipe
api_string_name: curated_transformer
---

<Infobox title="Important note" variant="warning">

This component is available via the extension package
[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers).
It exposes the component via entry points, so if you have the package installed,
using `factory = "curated_transformer"` in your
[training config](/usage/training#config) will work out-of-the-box.

</Infobox>

This pipeline component lets you use a curated set of transformer models in your
pipeline. spaCy Curated Transformers currently supports the following model
types:

- ALBERT
- BERT
- CamemBERT
- RoBERTa
- XLM-RoBERTa

If you want to use another type of model, use
[spacy-transformers](/api/spacy-transformers), which allows you to use all
Hugging Face transformer models with spaCy.

You will usually connect downstream components to a shared Curated Transformer
pipe using one of the Curated Transformer listener layers. This works similarly
to spaCy's [Tok2Vec](/api/tok2vec) component and the
[Tok2VecListener](/api/architectures/#Tok2VecListener) sublayer. The component
assigns the output of the transformer to the `Doc`'s extension attributes. To
access the values, you can use the custom
[`Doc._.trf_data`](#assigned-attributes) attribute.

For more details, see the [usage documentation](/usage/embeddings-transformers).

## Assigned Attributes {id="assigned-attributes"}

The component sets the following
[custom extension attribute](/usage/processing-pipeline#custom-components-attributes):

| Location         | Value                                                                       |
| ---------------- | --------------------------------------------------------------------------- |
| `Doc._.trf_data` | Curated Transformer outputs for the `Doc` object. ~~DocTransformerOutput~~ |
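
For example, assuming a pipeline that contains a `curated_transformer`
component (the pipeline name below is hypothetical):

```python
import spacy

nlp = spacy.load("my_curated_trf_pipeline")  # hypothetical pipeline package
doc = nlp("This is a sentence.")
trf_data = doc._.trf_data  # DocTransformerOutput, documented below
```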

## Config and Implementation {id="config"}

The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config). See the
[model architectures](/api/architectures#curated-trf) documentation for details
on the curated transformer architectures and their arguments and
hyperparameters.

> #### Example
>
> ```python
> from spacy_curated_transformers.pipeline.transformer import DEFAULT_CONFIG
>
> nlp.add_pipe("curated_transformer", config=DEFAULT_CONFIG)
> ```

| Setting             | Description                                                                                                                                                                                                                                        |
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [`XlmrTransformer`](/api/architectures#curated-trf). ~~Model~~                                                                                          |
| `frozen`            | If `True`, the model's weights are frozen and no backpropagation is performed. ~~bool~~                                                                                                                                                            |
| `all_layer_outputs` | If `True`, the model returns the outputs of all the layers. Otherwise, only the output of the last layer is returned. This must be set to `True` if any of the pipe's downstream listeners require the outputs of all transformer layers. ~~bool~~ |

```python
https://github.com/explosion/spacy-curated-transformers/blob/main/spacy_curated_transformers/pipeline/transformer.py
```

## CuratedTransformer.\_\_init\_\_ {id="init",tag="method"}

> #### Example
>
> ```python
> # Construction via add_pipe with default model
> trf = nlp.add_pipe("curated_transformer")
>
> # Construction via add_pipe with custom config
> config = {
>     "model": {
>         "@architectures": "spacy-curated-transformers.XlmrTransformer.v1",
>         "vocab_size": 250002,
>         "num_hidden_layers": 12,
>         "hidden_width": 768,
>         "piece_encoder": {
>             "@architectures": "spacy-curated-transformers.XlmrSentencepieceEncoder.v1"
>         }
>     }
> }
> trf = nlp.add_pipe("curated_transformer", config=config)
>
> # Construction from class
> from spacy_curated_transformers import CuratedTransformer
> trf = CuratedTransformer(nlp.vocab, model)
> ```

Construct a `CuratedTransformer` component. One or more subsequent spaCy
components can use the transformer outputs as features in their models, with
gradients backpropagated to the shared weights. The activations from the
transformer are saved in the [`Doc._.trf_data`](#assigned-attributes) extension
attribute. You can also provide a callback to set additional annotations. In
your application, you would normally use a shortcut for this and instantiate the
component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).

| Name                | Description                                                                                                                                                                                                                                        |
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab`             | The shared vocabulary. ~~Vocab~~                                                                                                                                                                                                                   |
| `model`             | One of the supported pre-trained transformer models. ~~Model~~                                                                                                                                                                                     |
| _keyword-only_      |                                                                                                                                                                                                                                                    |
| `name`              | The component instance name. ~~str~~                                                                                                                                                                                                               |
| `frozen`            | If `True`, the model's weights are frozen and no backpropagation is performed. ~~bool~~                                                                                                                                                            |
| `all_layer_outputs` | If `True`, the model returns the outputs of all the layers. Otherwise, only the output of the last layer is returned. This must be set to `True` if any of the pipe's downstream listeners require the outputs of all transformer layers. ~~bool~~ |

## CuratedTransformer.\_\_call\_\_ {id="call",tag="method"}

Apply the pipe to one document. The document is modified in place, and returned.
This usually happens under the hood when the `nlp` object is called on a text
and all pipeline components are applied to the `Doc` in order. Both
[`__call__`](/api/curatedtransformer#call) and
[`pipe`](/api/curatedtransformer#pipe) delegate to the
[`predict`](/api/curatedtransformer#predict) and
[`set_annotations`](/api/curatedtransformer#set_annotations) methods.

> #### Example
>
> ```python
> doc = nlp("This is a sentence.")
> trf = nlp.add_pipe("curated_transformer")
> # This usually happens under the hood
> processed = trf(doc)
> ```

| Name        | Description                      |
| ----------- | -------------------------------- |
| `doc`       | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~  |

## CuratedTransformer.pipe {id="pipe",tag="method"}

Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
applied to the `Doc` in order. Both [`__call__`](/api/curatedtransformer#call)
and [`pipe`](/api/curatedtransformer#pipe) delegate to the
[`predict`](/api/curatedtransformer#predict) and
[`set_annotations`](/api/curatedtransformer#set_annotations) methods.

> #### Example
>
> ```python
> trf = nlp.add_pipe("curated_transformer")
> for doc in trf.pipe(docs, batch_size=50):
>     pass
> ```

| Name           | Description                                                   |
| -------------- | ------------------------------------------------------------- |
| `stream`       | A stream of documents. ~~Iterable[Doc]~~                      |
| _keyword-only_ |                                                               |
| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS**     | The processed documents in order. ~~Doc~~                     |

## CuratedTransformer.initialize {id="initialize",tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
function that returns an iterable of [`Example`](/api/example) objects. **At
least one example should be supplied.** The data examples are used to
**initialize the model** of the component and can either be the full training
data or a representative sample. Initialization includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data. This method is typically called
by [`Language.initialize`](/api/language#initialize).

> #### Example
>
> ```python
> trf = nlp.add_pipe("curated_transformer")
> trf.initialize(lambda: examples, nlp=nlp)
> ```

| Name             | Description                                                                                                                                                                |
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples`   | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_   |                                                                                                                                                                            |
| `nlp`            | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                       |
| `encoder_loader` | Initialization callback for the transformer model. ~~Optional[Callable]~~                                                                                                  |
| `piece_loader`   | Initialization callback for the input piece encoder. ~~Optional[Callable]~~                                                                                                |

## CuratedTransformer.predict {id="predict",tag="method"}

Apply the component's model to a batch of [`Doc`](/api/doc) objects without
modifying them.

> #### Example
>
> ```python
> trf = nlp.add_pipe("curated_transformer")
> scores = trf.predict([doc1, doc2])
> ```

| Name        | Description                                 |
| ----------- | ------------------------------------------- |
| `docs`      | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | The model's prediction for each document.   |

## CuratedTransformer.set_annotations {id="set_annotations",tag="method"}

Assign the extracted features to the `Doc` objects. By default, the
[`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object is
written to the [`Doc._.trf_data`](#assigned-attributes) attribute. Your
`set_extra_annotations` callback is then called, if provided.

> #### Example
>
> ```python
> trf = nlp.add_pipe("curated_transformer")
> scores = trf.predict(docs)
> trf.set_annotations(docs, scores)
> ```

| Name     | Description                                                  |
| -------- | ------------------------------------------------------------ |
| `docs`   | The documents to modify. ~~Iterable[Doc]~~                   |
| `scores` | The scores to set, produced by `CuratedTransformer.predict`. |

## CuratedTransformer.update {id="update",tag="method"}

Prepare for an update to the transformer.

Like the [`Tok2Vec`](/api/tok2vec) component, the `CuratedTransformer` component
is unusual in that it does not receive "gold standard" annotations to calculate
a weight update. The optimal output of the transformer data is unknown; it's a
hidden layer inside the network that is updated by backpropagating from output
layers.

The `CuratedTransformer` component therefore does not perform a weight update
during its own `update` method. Instead, it runs its transformer model and
communicates the output and the backpropagation callback to any downstream
components that have been connected to it via the transformer listener sublayer.
If there are multiple listeners, the last layer will actually backprop to the
transformer and call the optimizer, while the others simply increment the
gradients.

> #### Example
>
> ```python
> trf = nlp.add_pipe("curated_transformer")
> optimizer = nlp.initialize()
> losses = trf.update(examples, sgd=optimizer)
> ```

| Name           | Description                                                                                                                                                                      |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`     | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
| _keyword-only_ |                                                                                                                                                                                  |
| `drop`         | The dropout rate. ~~float~~                                                                                                                                                      |
| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                    |
| `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                         |
| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                            |

## CuratedTransformer.create_optimizer {id="create_optimizer",tag="method"}

Create an optimizer for the pipeline component.

> #### Example
>
> ```python
> trf = nlp.add_pipe("curated_transformer")
> optimizer = trf.create_optimizer()
> ```

| Name        | Description                  |
| ----------- | ---------------------------- |
| **RETURNS** | The optimizer. ~~Optimizer~~ |

## CuratedTransformer.use_params {id="use_params",tag="method, contextmanager"}

Modify the pipe's model to use the given parameter values. At the end of the
context, the original parameters are restored.

> #### Example
>
> ```python
> trf = nlp.add_pipe("curated_transformer")
> with trf.use_params(optimizer.averages):
>     trf.to_disk("/best_model")
> ```

| Name     | Description                                        |
| -------- | -------------------------------------------------- |
| `params` | The parameter values to use in the model. ~~dict~~ |

## CuratedTransformer.to_disk {id="to_disk",tag="method"}

Serialize the pipe to disk.

> #### Example
>
> ```python
> trf = nlp.add_pipe("curated_transformer")
> trf.to_disk("/path/to/transformer")
> ```

| Name           | Description                                                                                                                                |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path`         | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ |                                                                                                                                            |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~                                                |

## CuratedTransformer.from_disk {id="from_disk",tag="method"}

Load the pipe from disk. Modifies the object in place and returns it.

> #### Example
>
> ```python
> trf = nlp.add_pipe("curated_transformer")
> trf.from_disk("/path/to/transformer")
> ```

| Name           | Description                                                                                      |
| -------------- | ------------------------------------------------------------------------------------------------ |
| `path`         | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ |                                                                                                  |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~      |
| **RETURNS**    | The modified `CuratedTransformer` object. ~~CuratedTransformer~~                                 |

## CuratedTransformer.to_bytes {id="to_bytes",tag="method"}

> #### Example
>
> ```python
> trf = nlp.add_pipe("curated_transformer")
> trf_bytes = trf.to_bytes()
> ```

Serialize the pipe to a bytestring.

| Name           | Description                                                                                 |
| -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ |                                                                                             |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS**    | The serialized form of the `CuratedTransformer` object. ~~bytes~~                           |

## CuratedTransformer.from_bytes {id="from_bytes",tag="method"}

Load the pipe from a bytestring. Modifies the object in place and returns it.

> #### Example
>
> ```python
> trf_bytes = trf.to_bytes()
> trf = nlp.add_pipe("curated_transformer")
> trf.from_bytes(trf_bytes)
> ```

| Name           | Description                                                                                 |
| -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data`   | The data to load from. ~~bytes~~                                                            |
| _keyword-only_ |                                                                                             |
| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS**    | The `CuratedTransformer` object. ~~CuratedTransformer~~                                     |

## Serialization Fields {id="serialization-fields"}

During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.

> #### Example
>
> ```python
> data = trf.to_disk("/path", exclude=["vocab"])
> ```

| Name    | Description                                                    |
| ------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab).                              |
| `cfg`   | The config file. You usually don't want to exclude this.       |
| `model` | The binary model data. You usually don't want to exclude this. |

## DocTransformerOutput {id="doctransformeroutput",tag="dataclass"}

Curated Transformer outputs for one `Doc` object. Stores the dense
representations generated by the transformer for each piece identifier. Piece
identifiers are grouped by token. Instances of this class are typically assigned
to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension
attribute.

| Name              | Description                                                                                                                                                                           |
| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `all_outputs`     | List of `Ragged` tensors that correspond to the outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ |
| `last_layer_only` | Whether only the last transformer layer's outputs are preserved. ~~bool~~                                                                                                            |

### DocTransformerOutput.embedding_layer {id="doctransformeroutput-embeddinglayer",tag="property"}

Return the output of the transformer's embedding layer or `None` if
`last_layer_only` is `True`.

| Name        | Description                                  |
| ----------- | -------------------------------------------- |
| **RETURNS** | Embedding layer output. ~~Optional[Ragged]~~ |

### DocTransformerOutput.last_hidden_layer_state {id="doctransformeroutput-lasthiddenlayerstate",tag="property"}

Return the output of the transformer's last hidden layer.

| Name        | Description                          |
| ----------- | ------------------------------------ |
| **RETURNS** | Last hidden layer output. ~~Ragged~~ |

### DocTransformerOutput.all_hidden_layer_states {id="doctransformeroutput-allhiddenlayerstates",tag="property"}

Return the outputs of all transformer layers (excluding the embedding layer).

| Name        | Description                            |
| ----------- | -------------------------------------- |
| **RETURNS** | Hidden layer outputs. ~~List[Ragged]~~ |

### DocTransformerOutput.num_outputs {id="doctransformeroutput-numoutputs",tag="property"}

Return the number of layer outputs stored in the `DocTransformerOutput` instance
(including the embedding layer).

| Name        | Description                |
| ----------- | -------------------------- |
| **RETURNS** | Number of outputs. ~~int~~ |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
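Taken together, these accessors can be inspected directly on a processed `Doc`.
Below is a minimal sketch; the pipeline name is a placeholder, and it assumes a
pipeline whose transformer component is a curated transformer:

```python
import spacy

# Placeholder pipeline name; assumes its transformer component is a curated
# transformer, so `doc._.trf_data` holds a DocTransformerOutput.
nlp = spacy.load("en_core_web_trf")
doc = nlp("spaCy is a free open-source library for NLP.")

trf_data = doc._.trf_data
print(trf_data.num_outputs)              # layer outputs stored for this Doc
last = trf_data.last_hidden_layer_state  # Ragged, piece outputs grouped by token
print(last.lengths)                      # number of pieces per token
```
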
## Span Getters {id="span_getters",source="github.com/explosion/spacy-transformers/blob/master/spacy_curated_transformers/span_getters.py"}

Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
return a list of [`Span`](/api/span) objects for each doc to be processed by
the transformer. This is used to manage long documents by cutting them into
smaller sequences before running the transformer. The spans are allowed to
overlap, and you can also omit sections of the `Doc` if they are not relevant.
Span getters can be referenced in the
`[components.transformer.model.with_spans]` block of the config to customize
the sequences processed by the transformer.

| Name        | Description                                                   |
| ----------- | ------------------------------------------------------------- |
| `docs`      | A batch of `Doc` objects. ~~Iterable[Doc]~~                   |
| **RETURNS** | The spans to process by the transformer. ~~List[List[Span]]~~ |

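To illustrate the contract, here is a hypothetical span getter (not a function
shipped with the library) that hands each sentence to the transformer as its
own sequence:

```python
from typing import Iterable, List

from spacy.tokens import Doc, Span


def sentence_spans(docs: Iterable[Doc]) -> List[List[Span]]:
    # One list of spans per Doc; assumes sentence boundaries were set by an
    # earlier component such as a sentencizer or parser.
    return [list(doc.sents) for doc in docs]
```
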
### WithStridedSpans.v1 {id="strided_spans",tag="registered function"}

> #### Example config
>
> ```ini
> [transformer.model.with_spans]
> @architectures = "spacy-curated-transformers.WithStridedSpans.v1"
> stride = 96
> window = 128
> ```

Create a span getter for strided spans. If you set the `window` and `stride` to
the same value, the spans will cover each token once. Setting `stride` lower
than `window` will allow for an overlap, so that some tokens are counted twice.
This can be desirable, because it allows all tokens to have both a left and
right context.

| Name     | Description              |
| -------- | ------------------------ |
| `window` | The window size. ~~int~~ |
| `stride` | The stride size. ~~int~~ |

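To make the overlap concrete, this standalone sketch (plain Python, not
library code) enumerates the piece ranges that `stride = 96`, `window = 128`
would produce; consecutive windows overlap by `window - stride = 32` pieces:

```python
def strided_windows(n_pieces: int, window: int, stride: int):
    # Start a new window every `stride` pieces, each spanning up to `window`
    # pieces, clipped at the end of the sequence.
    return [
        (start, min(start + window, n_pieces))
        for start in range(0, n_pieces, stride)
    ]


print(strided_windows(300, window=128, stride=96))
# [(0, 128), (96, 224), (192, 300), (288, 300)]
```
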
## Model Loaders

[Curated Transformer models](/api/architectures#curated-trf) are constructed
with default hyperparameters and randomized weights when the pipeline is
created. To load the weights of an existing pre-trained model into the
pipeline, use one of the following loader callbacks. The pre-trained model must
have the same hyperparameters as the model used by the pipeline.

### HFTransformerEncoderLoader.v1 {id="hf_trfencoder_loader",tag="registered_function"}

Construct a callback that initializes a supported transformer model with
weights from a corresponding HuggingFace model.

| Name       | Description                                |
| ---------- | ------------------------------------------ |
| `name`     | Name of the HuggingFace model. ~~str~~     |
| `revision` | Name of the model revision/branch. ~~str~~ |

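As a sketch, such a loader could be referenced from the transformer's
`encoder_loader` block in the `[initialize]` section of the config; the block
path and model name below are illustrative assumptions:

```ini
[initialize.components.transformer.encoder_loader]
@model_loaders = "spacy-curated-transformers.HFTransformerEncoderLoader.v1"
name = "bert-base-uncased"
revision = "main"
```
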
### PyTorchCheckpointLoader.v1 {id="pytorch_checkpoint_loader",tag="registered_function"}

Construct a callback that initializes a supported transformer model with
weights from a PyTorch checkpoint.

| Name   | Description                              |
| ------ | ---------------------------------------- |
| `path` | Path to the PyTorch checkpoint. ~~Path~~ |

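Analogously, a local checkpoint could be wired up as follows (block path and
file path are illustrative assumptions):

```ini
[initialize.components.transformer.encoder_loader]
@model_loaders = "spacy-curated-transformers.PyTorchCheckpointLoader.v1"
path = "checkpoints/pytorch_model.bin"
```
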
## Tokenizer Loaders

[Curated Transformer models](/api/architectures#curated-trf) must be paired
with a matching tokenizer (piece encoder) model in a spaCy pipeline. As with
the transformer models, tokenizers are constructed with an empty vocabulary
during pipeline creation; they need to be initialized with an appropriate
loader before use in training/inference.

### ByteBPELoader.v1 {id="bytebpe_loader",tag="registered_function"}

Construct a callback that initializes a Byte-BPE piece encoder model.

| Name          | Description                           |
| ------------- | ------------------------------------- |
| `vocab_path`  | Path to the vocabulary file. ~~Path~~ |
| `merges_path` | Path to the merges file. ~~Path~~     |

### CharEncoderLoader.v1 {id="charencoder_loader",tag="registered_function"}

Construct a callback that initializes a character piece encoder model.

| Name        | Description                                                                  |
| ----------- | ---------------------------------------------------------------------------- |
| `path`      | Path to the serialized character model. ~~Path~~                             |
| `bos_piece` | Piece used as a beginning-of-sentence token. Defaults to `"[BOS]"`. ~~str~~  |
| `eos_piece` | Piece used as an end-of-sentence token. Defaults to `"[EOS]"`. ~~str~~       |
| `unk_piece` | Piece used as a stand-in for unknown tokens. Defaults to `"[UNK]"`. ~~str~~  |
| `normalize` | Unicode normalization form to use. Defaults to `"NFKC"`. ~~str~~             |

### HFPieceEncoderLoader.v1 {id="hf_pieceencoder_loader",tag="registered_function"}

Construct a callback that initializes a HuggingFace piece encoder model. Used
in conjunction with the HuggingFace model loader.

| Name       | Description                                |
| ---------- | ------------------------------------------ |
| `name`     | Name of the HuggingFace model. ~~str~~     |
| `revision` | Name of the model revision/branch. ~~str~~ |

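A sketch of pairing this loader with the corresponding HuggingFace model
loader in the `[initialize]` section (block path and model name are
illustrative assumptions):

```ini
[initialize.components.transformer.piecer_loader]
@model_loaders = "spacy-curated-transformers.HFPieceEncoderLoader.v1"
name = "bert-base-uncased"
```
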
### SentencepieceLoader.v1 {id="sentencepiece_loader",tag="registered_function"}

Construct a callback that initializes a SentencePiece piece encoder model.

| Name   | Description                                          |
| ------ | ---------------------------------------------------- |
| `path` | Path to the serialized SentencePiece model. ~~Path~~ |

### WordpieceLoader.v1 {id="wordpiece_loader",tag="registered_function"}

Construct a callback that initializes a WordPiece piece encoder model.

| Name   | Description                                       |
| ------ | ------------------------------------------------- |
| `path` | Path to the serialized WordPiece model. ~~Path~~  |

## Callbacks

### gradual_transformer_unfreezing.v1 {id="gradual_transformer_unfreezing",tag="registered_function"}

Construct a callback that can be used to gradually unfreeze the weights of one
or more Transformer components during training. This can be used to prevent
catastrophic forgetting during fine-tuning.

| Name           | Description                                                                                                                                                                     |
| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `target_pipes` | A dictionary whose keys and values correspond to the names of Transformer components and the training step at which they should be unfrozen, respectively. ~~Dict[str, int]~~ |

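As a sketch, the callback might be registered in the training config as
follows, unfreezing a component named `transformer` after an assumed 6000
steps:

```ini
[training.callbacks]
@callbacks = "spacy-curated-transformers.gradual_transformer_unfreezing.v1"

[training.callbacks.target_pipes]
transformer = 6000
```
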
@ -19,8 +19,8 @@ prototyping** and **prompting**, and turning unstructured responses into
An LLM component is implemented through the `LLMWrapper` class. It is accessible
through a generic `llm`
[component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories)
as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
`llm_rel`, `llm_textcat`, `llm_sentiment` and `llm_summarization`.

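A minimal usage sketch of the generic factory follows; the task and model
registry entries and the labels are illustrative choices, not the only
options:

```python
import spacy

nlp = spacy.blank("en")
# "spacy.NER.v3" and "spacy.GPT-3-5.v2" are example registry entries; any
# registered spacy-llm task and model can be substituted here.
nlp.add_pipe(
    "llm",
    config={
        "task": {"@llm_tasks": "spacy.NER.v3", "labels": "PERSON,LOCATION"},
        "model": {"@llm_models": "spacy.GPT-3-5.v2"},
    },
)
doc = nlp("Jack and Jill went up the hill.")
print([(ent.text, ent.label_) for ent in doc.ents])
```
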
### LLMWrapper.\_\_init\_\_ {id="init",tag="method"}

@ -255,9 +255,11 @@ prompting.
> ```

| Argument                    | Description                                                                                                                                                                                   |
| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `template`                  | Custom prompt template to send to LLM model. Defaults to [summarization.v1.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/summarization.v1.jinja). ~~str~~ |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                                |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SummarizationTask]]~~                                  |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `SummarizationExample`. ~~Optional[Type[FewshotExample]]~~                                                                                      |
| `max_n_words`               | Maximum number of words to be used in summary. Note that this should not be expected to work exactly. Defaults to `None`. ~~Optional[int]~~                                                   |
| `field`                     | Name of extension attribute to store summary in (i.e. the summary will be available in `doc._.{field}`). Defaults to `summary`. ~~str~~                                                       |

@ -326,12 +328,15 @@ the v3 implementation will use a dummy example in the prompt. Technically this
means that the task will always perform few-shot prompting under the hood.

| Argument                    | Description                                                                                                                                                                                            |
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `template`                  | Custom prompt template to send to LLM model. Defaults to [ner.v3.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/ner.v3.jinja). ~~str~~                              |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                                         |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[NERTask]]~~                                                     |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `NERExample`. ~~Optional[Type[FewshotExample]]~~                                                                                                         |
| `scorer`                    | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                                                   |
| `labels`                    | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~                                                                                                                     |
| `label_definitions`         | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ |
| `description` (NEW)         | A description of what to recognize or not recognize as entities. ~~str~~                                                                                                                               |
| `normalizer`                | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~                              |
| `alignment_mode`            | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~                         |
| `case_sensitive_matching`   | Whether to search with case sensitivity. Defaults to `False`. ~~bool~~                                                                                                                                 |

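For the `examples` argument, a few-shot reader can be referenced from the
task's config block; the file name here is a placeholder:

```ini
[components.llm.task.examples]
@misc = "spacy.FewShotReader.v1"
path = "ner_examples.yml"
```
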
@ -416,11 +421,14 @@ v1.
> ```

| Argument                    | Description                                                                                                                                                                                            |
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `template` (NEW)            | Custom prompt template to send to LLM model. Defaults to [ner.v2.jinja](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/ner.v2.jinja). ~~str~~                              |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                                         |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[NERTask]]~~                                                     |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `NERExample`. ~~Optional[Type[FewshotExample]]~~                                                                                                         |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                                                   |
| `labels`                    | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~                                                                                                                     |
| `label_definitions` (NEW)   | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ |
| `normalizer`                | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~                              |
| `alignment_mode`            | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~                         |
| `case_sensitive_matching`   | Whether to search with case sensitivity. Defaults to `False`. ~~bool~~                                                                                                                                 |

@ -468,9 +476,12 @@ few-shot prompting.
> ```

| Argument                    | Description                                                                                                                                                                    |
| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                 |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[NERTask]]~~                             |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `NERExample`. ~~Optional[Type[FewshotExample]]~~                                                                                 |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                           |
| `labels`                    | Comma-separated list of labels. ~~str~~                                                                                                                                        |
| `normalizer`                | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~                          |
| `alignment_mode`            | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ |
| `case_sensitive_matching`   | Whether to search with case sensitivity. Defaults to `False`. ~~bool~~                                                                                                         |

@ -540,13 +551,16 @@ support overlapping entities and store its annotations in `doc.spans`.
> ```

| Argument                    | Description                                                                                                                                                                                            |
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `template`                  | Custom prompt template to send to LLM model. Defaults to [`spancat.v3.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/spancat.v3.jinja). ~~str~~                    |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                                         |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SpanCatTask]]~~                                                 |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `SpanCatExample`. ~~Optional[Type[FewshotExample]]~~                                                                                                     |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                                                   |
| `labels`                    | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~                                                                                                                     |
| `label_definitions`         | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ |
| `description` (NEW)         | A description of what to recognize or not recognize as entities. ~~str~~                                                                                                                               |
| `spans_key`                 | Key of the `Doc.spans` dict to save the spans under. Defaults to `"sc"`. ~~str~~                                                                                                                       |
| `normalizer`                | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~                                                  |
| `alignment_mode`            | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~                         |
| `case_sensitive_matching`   | Whether to search with case sensitivity. Defaults to `False`. ~~bool~~                                                                                                                                 |

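A sketch of selecting this task in the config (the labels and spans key are
illustrative):

```ini
[components.llm.task]
@llm_tasks = "spacy.SpanCat.v3"
labels = "PERSON,LOCATION"
spans_key = "sc"
```
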
@ -569,12 +583,15 @@ support overlapping entities and store its annotations in `doc.spans`.
> ```

| Argument                    | Description                                                                                                                                                                                            |
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `template` (NEW)            | Custom prompt template to send to LLM model. Defaults to [`spancat.v2.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/spancat.v2.jinja). ~~str~~                    |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                                         |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SpanCatTask]]~~                                                 |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `SpanCatExample`. ~~Optional[Type[FewshotExample]]~~                                                                                                     |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                                                   |
| `labels`                    | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~                                                                                                                     |
| `label_definitions` (NEW)   | Optional dict mapping a label to a description of that label. These descriptions are added to the prompt to help instruct the LLM on what to extract. Defaults to `None`. ~~Optional[Dict[str, str]]~~ |
| `spans_key`                 | Key of the `Doc.spans` dict to save the spans under. Defaults to `"sc"`. ~~str~~                                                                                                                       |
| `normalizer`                | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~                                                  |
| `alignment_mode`            | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~                         |
| `case_sensitive_matching`   | Whether to search with case sensitivity. Defaults to `False`. ~~bool~~                                                                                                                                 |

@ -600,10 +617,13 @@ v1 NER task to support overlapping entities and store its annotations in
> ```

| Argument                    | Description                                                                                                                                                                    |
| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                 |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SpanCatTask]]~~                         |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `SpanCatExample`. ~~Optional[Type[FewshotExample]]~~                                                                             |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                           |
| `labels`                    | Comma-separated list of labels. ~~str~~                                                                                                                                        |
| `spans_key`                 | Key of the `Doc.spans` dict to save the spans under. Defaults to `"sc"`. ~~str~~                                                                                               |
| `normalizer`                | Function that normalizes the labels as returned by the LLM. If `None`, defaults to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~                          |
| `alignment_mode`            | Alignment mode in case the LLM returns entities that do not align with token boundaries. Options are `"strict"`, `"contract"` or `"expand"`. Defaults to `"contract"`. ~~str~~ |
| `case_sensitive_matching`   | Whether to search with case sensitivity. Defaults to `False`. ~~bool~~                                                                                                         |

@ -637,11 +657,14 @@ prompt.
> ```

| Argument                    | Description                                                                                                                                                                         |
| --------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `template`                  | Custom prompt template to send to LLM model. Defaults to [`textcat.v3.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/textcat.v3.jinja). ~~str~~ |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                      |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[TextCatTask]]~~                              |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `TextCatExample`. ~~Optional[Type[FewshotExample]]~~                                                                                  |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                                |
| `labels`                    | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~                                                                                                  |
| `label_definitions` (NEW)   | Dictionary of label definitions. Included in the prompt, if set. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                                                   |
| `normalizer`                | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~         |
| `exclusive_classes`         | If set to `True`, only one label per document should be valid. If set to `False`, one document can have multiple labels. Defaults to `False`. ~~bool~~                              |
| `allow_none`                | When set to `True`, allows the LLM to not return any of the given labels. The resulting dict in `doc.cats` will have `0.0` scores for all labels. Defaults to `True`. ~~bool~~      |

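A sketch of selecting this task with mutually exclusive classes (the labels
are illustrative):

```ini
[components.llm.task]
@llm_tasks = "spacy.TextCat.v3"
labels = "COMPLIMENT,INSULT"
exclusive_classes = true
```
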
					@ -664,10 +687,13 @@ V2 includes all v1 functionality, with an improved prompt template.
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Argument                    | Description                                                                                                                                                                         |
 | 
					| Argument                    | Description                                                                                                                                                                         |
 | 
				
			||||||
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `template` (NEW)            | Custom prompt template to send to LLM model. Defaults to [`textcat.v2.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/textcat.v2.jinja). ~~str~~   |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                        |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[TextCatTask]]~~                                |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `TextCatExample`. ~~Optional[Type[FewshotExample]]~~                                                                                    |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                                  |
| `labels`                    | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~                                                                                                    |
| `normalizer`                | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~                               |
| `exclusive_classes`         | If set to `True`, only one label per document should be valid. If set to `False`, one document can have multiple labels. Defaults to `False`. ~~bool~~                                |
| `allow_none`                | When set to `True`, allows the LLM to not return any of the given labels. The resulting dict in `doc.cats` will have `0.0` scores for all labels. Defaults to `True`. ~~bool~~        |
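
Putting the table to work, such a task can be assembled directly in Python. A
minimal sketch, assuming this table describes `spacy.TextCat.v2`; the label set
and model choice are illustrative, and the relevant API key must be set (see
[API Keys](#api-keys)):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "llm",
    config={
        "task": {
            "@llm_tasks": "spacy.TextCat.v2",
            "labels": "COMPLIMENT,INSULT",  # comma-separated string or list
            "exclusive_classes": True,      # exactly one label per document
        },
        "model": {"@llm_models": "spacy.GPT-3-5.v2"},
    },
)
doc = nlp("You look great today!")
print(doc.cats)  # e.g. {"COMPLIMENT": 1.0, "INSULT": 0.0}
```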

@@ -691,13 +717,16 @@ prompting.

> ```

| Argument                    | Description                                                                                                                                                                    |
| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[TextCatTask]]~~                        |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `TextCatExample`. ~~Optional[Type[FewshotExample]]~~                                                                            |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                          |
| `labels`                    | Comma-separated list of labels. ~~str~~                                                                                                                                       |
| `normalizer`                | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. ~~Optional[Callable[[str], str]]~~                       |
| `exclusive_classes`         | If set to `True`, only one label per document should be valid. If set to `False`, one document can have multiple labels. Defaults to `False`. ~~bool~~                        |
| `allow_none`                | When set to `True`, allows the LLM to not return any of the given labels. The resulting dict in `doc.cats` will have `0.0` scores for all labels. Defaults to `True`. ~~bool~~ |
| `verbose`                   | If set to `True`, warnings will be generated when the LLM returns invalid responses. Defaults to `False`. ~~bool~~                                                            |

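As with the previous variant, this task can be wired up in Python. The sketch
below (labels and file name illustrative, and assuming this table describes
`spacy.TextCat.v1`) also shows how a few-shot examples file, discussed next, is
attached via the registered `spacy.FewShotReader.v1` reader:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "llm",
    config={
        "task": {
            "@llm_tasks": "spacy.TextCat.v1",
            "labels": "COMPLIMENT,INSULT",
            # Registered reader that yields few-shot examples at runtime;
            # the file name is illustrative.
            "examples": {"@misc": "spacy.FewShotReader.v1", "path": "textcat_examples.yml"},
        },
        "model": {"@llm_models": "spacy.GPT-3-5.v2"},
    },
)
```
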
To perform [few-shot learning](/usage/large-language-models#few-shot-prompts),
you can write down a few examples in a separate file, and provide these to be

@@ -741,11 +770,14 @@ on an upstream NER component for entities extraction.

> ```

| Argument                    | Description                                                                                                                                                                 |
| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `template`                  | Custom prompt template to send to LLM model. Defaults to [`rel.v1.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/rel.v1.jinja). ~~str~~ |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                              |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[RELTask]]~~                          |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `RELExample`. ~~Optional[Type[FewshotExample]]~~                                                                              |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                        |
| `labels`                    | List of labels or str of comma-separated list of labels. ~~Union[List[str], str]~~                                                                                          |
| `label_definitions`         | Dictionary providing a description for each relation label. Defaults to `None`. ~~Optional[Dict[str, str]]~~                                                                |
| `normalizer`                | Function that normalizes the labels as returned by the LLM. If `None`, falls back to `spacy.LowercaseNormalizer.v1`. Defaults to `None`. ~~Optional[Callable[[str], str]]~~ |
| `verbose`                   | If set to `True`, warnings will be generated when the LLM returns invalid responses. Defaults to `False`. ~~bool~~                                                          |

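For a concrete starting point, the task can be added in Python. A sketch,
assuming `spacy.REL.v1` as the registered task name, a small installed pipeline
such as `en_core_web_sm` to provide the upstream NER component, and illustrative
relation labels and definitions:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # provides the upstream NER component
nlp.add_pipe(
    "llm",
    config={
        "task": {
            "@llm_tasks": "spacy.REL.v1",
            "labels": "LivesIn,Visits",  # illustrative relation labels
            "label_definitions": {
                "LivesIn": "The subject permanently resides in the object location.",
                "Visits": "The subject is only temporarily at the object location.",
            },
        },
        "model": {"@llm_models": "spacy.GPT-3-5.v2"},
    },
)
doc = nlp("Laura visited Barcelona last week.")
print(doc._.rel)  # extracted relations, stored in a custom attribute
```
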

@@ -794,9 +826,12 @@ This task supports both zero-shot and few-shot prompting.

> ```

| Argument                    | Description                                                                                                                                                                   |
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `template`                  | Custom prompt template to send to LLM model. Defaults to [`lemma.v1.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/lemma.v1.jinja). ~~str~~ |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[LemmaTask]]~~                          |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `LemmaExample`. ~~Optional[Type[FewshotExample]]~~                                                                              |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                          |

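A minimal usage sketch, assuming `spacy.Lemma.v1` as the registered task name
and an OpenAI model with its API key set as described under
[API Keys](#api-keys):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "llm",
    config={
        "task": {"@llm_tasks": "spacy.Lemma.v1"},
        "model": {"@llm_models": "spacy.GPT-3-5.v2"},
    },
)
doc = nlp("I went looking for the keys.")
print([(token.text, token.lemma_) for token in doc])
```
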
The task prompts the LLM to lemmatize the passed text and return the lemmatized
version as a list of tokens and their corresponding lemma. E. g. the text

@@ -871,9 +906,12 @@ This task supports both zero-shot and few-shot prompting.

> ```

| Argument                    | Description                                                                                                                                                                             |
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `template`                  | Custom prompt template to send to LLM model. Defaults to [`sentiment.v1.jinja`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/tasks/templates/sentiment.v1.jinja). ~~str~~ |
| `examples`                  | Optional function that generates examples for few-shot learning. Defaults to `None`. ~~Optional[Callable[[], Iterable[Any]]]~~                                                         |
| `parse_responses` (NEW)     | Callable for parsing LLM responses for this task. Defaults to the internal parsing method for this task. ~~Optional[TaskResponseParser[SentimentTask]]~~                               |
| `prompt_example_type` (NEW) | Type to use for fewshot examples. Defaults to `SentimentExample`. ~~Optional[Type[FewshotExample]]~~                                                                                   |
| `scorer` (NEW)              | Scorer function that evaluates the task performance on provided examples. Defaults to the metric used by spaCy. ~~Optional[Scorer]~~                                                   |
| `field`                     | Name of extension attribute to store the sentiment score in (i. e. the score will be available in `doc._.{field}`). Defaults to `sentiment`. ~~str~~                                   |

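A minimal usage sketch, assuming `spacy.Sentiment.v1` as the registered task
name; the default `field` is spelled out for clarity:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "llm",
    config={
        "task": {"@llm_tasks": "spacy.Sentiment.v1", "field": "sentiment"},
        "model": {"@llm_models": "spacy.GPT-3-5.v2"},
    },
)
doc = nlp("This is the best movie I have seen in years!")
print(doc._.sentiment)  # a score between 0.0 (negative) and 1.0 (positive)
```
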
To perform [few-shot learning](/usage/large-language-models#few-shot-prompts),

@@ -953,11 +991,11 @@ provider's API.

Currently, these models are provided as part of the core library:

| Model                         | Provider          | Supported names                                                                                                    | Default name           | Default config                       |
| ----------------------------- | ----------------- | ------------------------------------------------------------------------------------------------------------------ | ---------------------- | ------------------------------------ |
| `spacy.GPT-4.v1`              | OpenAI            | `["gpt-4", "gpt-4-0314", "gpt-4-32k", "gpt-4-32k-0314"]`                                                           | `"gpt-4"`              | `{}`                                 |
| `spacy.GPT-4.v2`              | OpenAI            | `["gpt-4", "gpt-4-0314", "gpt-4-32k", "gpt-4-32k-0314"]`                                                           | `"gpt-4"`              | `{temperature=0.0}`                  |
| `spacy.GPT-3-5.v1`            | OpenAI            | `["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613-16k", "gpt-3.5-turbo-instruct"]` | `"gpt-3.5-turbo"`      | `{}`                                 |
| `spacy.GPT-3-5.v2`            | OpenAI            | `["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0613-16k", "gpt-3.5-turbo-instruct"]` | `"gpt-3.5-turbo"`      | `{temperature=0.0}`                  |
| `spacy.Davinci.v1`            | OpenAI            | `["davinci"]`                                                                                                      | `"davinci"`            | `{}`                                 |
| `spacy.Davinci.v2`            | OpenAI            | `["davinci"]`                                                                                                      | `"davinci"`            | `{temperature=0.0, max_tokens=500}`  |
| `spacy.Text-Davinci.v1`       | OpenAI            | `["text-davinci-003", "text-davinci-002"]`                                                                         | `"text-davinci-003"`   | `{}`                                 |

@@ -976,6 +1014,7 @@ Currently, these models are provided as part of the core library:

| `spacy.Ada.v2`                | OpenAI            | `["ada"]`                                                                                                          | `"ada"`                | `{temperature=0.0, max_tokens=500}`  |
| `spacy.Text-Ada.v1`           | OpenAI            | `["text-ada-001"]`                                                                                                 | `"text-ada-001"`       | `{}`                                 |
| `spacy.Text-Ada.v2`           | OpenAI            | `["text-ada-001"]`                                                                                                 | `"text-ada-001"`       | `{temperature=0.0, max_tokens=500}`  |
| `spacy.Azure.v1`              | Microsoft, OpenAI | Arbitrary values                                                                                                   | No default             | `{temperature=0.0}`                  |
| `spacy.Command.v1`            | Cohere            | `["command", "command-light", "command-light-nightly", "command-nightly"]`                                         | `"command"`            | `{}`                                 |
| `spacy.Claude-2.v1`           | Anthropic         | `["claude-2", "claude-2-100k"]`                                                                                    | `"claude-2"`           | `{}`                                 |
| `spacy.Claude-1.v1`           | Anthropic         | `["claude-1", "claude-1-100k"]`                                                                                    | `"claude-1"`           | `{}`                                 |

@@ -984,10 +1023,29 @@ Currently, these models are provided as part of the core library:

| `spacy.Claude-1-3.v1`         | Anthropic         | `["claude-1.3", "claude-1.3-100k"]`                                                                                | `"claude-1.3"`         | `{}`                                 |
| `spacy.Claude-instant-1.v1`   | Anthropic         | `["claude-instant-1", "claude-instant-1-100k"]`                                                                    | `"claude-instant-1"`   | `{}`                                 |
| `spacy.Claude-instant-1-1.v1` | Anthropic         | `["claude-instant-1.1", "claude-instant-1.1-100k"]`                                                                | `"claude-instant-1.1"` | `{}`                                 |
| `spacy.PaLM.v1`               | Google            | `["chat-bison-001", "text-bison-001"]`                                                                             | `"text-bison-001"`     | `{temperature=0.0}`                  |

To use these models, make sure that you've
[set the relevant API keys](#api-keys) as environment variables.

**⚠️ A note on `spacy.Azure.v1`.** Working with Azure OpenAI is slightly
different from working with models from other providers:

- In Azure, LLMs have to be made available by creating a _deployment_ of a given
  model (e. g. GPT-3.5). This deployment can have an arbitrary name. The `name`
  argument, which everywhere else denotes the model name (e. g. `claude-1.0`,
  `gpt-3.5`), here refers to the _deployment name_.
- Deployed Azure OpenAI models are reachable via a resource-specific base URL,
  usually of the form `https://{resource}.openai.azure.com`. Hence the URL has
  to be specified via the `base_url` argument.
- Azure further expects the _API version_ to be specified. The default value for
  this, via the `api_version` argument, is currently `2023-05-15` but may be
  updated in the future.
- Finally, since we can't infer information about the model from the deployment
  name, `spacy-llm` requires the `model_type` to be set to either
  `"completions"` or `"chat"`, depending on whether the deployed model is a
  completion or chat model (see the configuration sketch after this list).

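Putting these four points together, an Azure-backed component might be
configured as in the following sketch; the deployment name and resource URL are
illustrative placeholders, and the task is an arbitrary example:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "llm",
    config={
        "task": {"@llm_tasks": "spacy.NER.v3", "labels": ["PERSON", "ORG"]},
        "model": {
            "@llm_models": "spacy.Azure.v1",
            "name": "my-gpt35-deployment",  # the deployment name, not the model name
            "base_url": "https://my-resource.openai.azure.com",
            "model_type": "chat",  # or "completions", depending on the deployment
            "api_version": "2023-05-15",  # current default, shown for completeness
        },
    },
)
```
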
#### API Keys {id="api-keys"}

Note that when using hosted services, you have to ensure that the proper API

@@ -1014,6 +1072,12 @@ For Anthropic:

export ANTHROPIC_API_KEY="..."
```

For PaLM:

```shell
export PALM_API_KEY="..."
```

### Models via HuggingFace {id="models-hf"}

These models all take the same parameters:

| Model                | Provider        | Supported names                                                                                              | HF directory                           |
| -------------------- | --------------- | -------------------------------------------------------------------------------------------------------------- | -------------------------------------- |
| `spacy.Dolly.v1`     | Databricks      | `["dolly-v2-3b", "dolly-v2-7b", "dolly-v2-12b"]`                                                             | https://huggingface.co/databricks      |
| `spacy.Falcon.v1`    | TII             | `["falcon-rw-1b", "falcon-7b", "falcon-7b-instruct", "falcon-40b-instruct"]`                                 | https://huggingface.co/tiiuae          |
| `spacy.Llama2.v1`    | Meta AI         | `["Llama-2-7b-hf", "Llama-2-13b-hf", "Llama-2-70b-hf"]`                                                      | https://huggingface.co/meta-llama      |
| `spacy.Mistral.v1`   | Mistral AI      | `["Mistral-7B-v0.1", "Mistral-7B-Instruct-v0.1"]`                                                            | https://huggingface.co/mistralai       |
| `spacy.StableLM.v1`  | Stability AI    | `["stablelm-base-alpha-3b", "stablelm-base-alpha-7b", "stablelm-tuned-alpha-3b", "stablelm-tuned-alpha-7b"]` | https://huggingface.co/stabilityai     |
| `spacy.OpenLLaMA.v1` | OpenLM Research | `["open_llama_3b", "open_llama_7b", "open_llama_7b_v2", "open_llama_13b"]`                                   | https://huggingface.co/openlm-research |

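As an illustration, the following sketch backs an LLM component with the
smallest Dolly model; the task is an arbitrary example, and the weights are
fetched from the Hugging Face Hub on first use:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "llm",
    config={
        "task": {"@llm_tasks": "spacy.NER.v3", "labels": ["PERSON", "ORG"]},
        "model": {"@llm_models": "spacy.Dolly.v1", "name": "dolly-v2-3b"},
    },
)
```
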
<Infobox variant="warning" title="Gated models on Hugging Face" id="hf_licensing">

Some models available on Hugging Face (HF), such as Llama 2, are _gated models_.
That means that users have to fulfill certain requirements to be allowed access
to these models. In the case of Llama 2 you'll need to request access and agree
to Meta's Terms of Service while logged in with your HF account. After Meta
grants you permission to use Llama 2, you'll be able to download and use the
model.

This requires that you are logged in with your HF account on your local
machine - check out the HF quick start documentation. In a nutshell, you'll need
to create an access token on HF and log in to HF using your access token, e. g.
with `huggingface-cli login`.

</Infobox>

Note that Hugging Face will download the model the first time you use it - you
can [define the cache directory](https://huggingface.co/docs/huggingface_hub/main/en/guides/manage-cache)

@@ -297,10 +297,9 @@ The vector size, i.e. `rows * dims`.

## Vectors.is_full {id="is_full",tag="property"}

Whether the vectors table is full and no slots are available for new keys. If a
table is full, it can be resized using [`Vectors.resize`](/api/vectors#resize).
In `floret` mode, the table is always full and cannot be resized.

> #### Example
>

@@ -441,7 +440,7 @@ Load state from a binary string.

> #### Example
>
> ```python
> from spacy.strings import StringStore
> from spacy.vectors import Vectors
> vectors_bytes = vectors.to_bytes()
> new_vectors = Vectors(StringStore())
> new_vectors.from_bytes(vectors_bytes)

@@ -632,6 +632,165 @@ def MyCustomVectors(

    )
```

#### Creating a custom vectors implementation {id="custom-vectors",version="3.7"}

You can specify a custom registered vectors class under `[nlp.vectors]` in order
to use static vectors in formats other than the ones supported by
[`Vectors`](/api/vectors). Extend the abstract [`BaseVectors`](/api/basevectors)
class to implement your custom vectors.

As an example, the following `BPEmbVectors` class implements support for
[BPEmb subword embeddings](https://bpemb.h-its.org/):

```python
# requires: pip install bpemb
import warnings
from pathlib import Path
from typing import Callable, Optional, cast

from bpemb import BPEmb
from thinc.api import Ops, get_current_ops
from thinc.backends import get_array_ops
from thinc.types import Floats2d

from spacy.strings import StringStore
from spacy.util import registry
from spacy.vectors import BaseVectors
from spacy.vocab import Vocab


class BPEmbVectors(BaseVectors):
    def __init__(
        self,
        *,
        strings: Optional[StringStore] = None,
        lang: Optional[str] = None,
        vs: Optional[int] = None,
        dim: Optional[int] = None,
        cache_dir: Optional[Path] = None,
        encode_extra_options: Optional[str] = None,
        model_file: Optional[Path] = None,
        emb_file: Optional[Path] = None,
    ):
        kwargs = {}
        if lang is not None:
            kwargs["lang"] = lang
        if vs is not None:
            kwargs["vs"] = vs
        if dim is not None:
            kwargs["dim"] = dim
        if cache_dir is not None:
            kwargs["cache_dir"] = cache_dir
        if encode_extra_options is not None:
            kwargs["encode_extra_options"] = encode_extra_options
        if model_file is not None:
            kwargs["model_file"] = model_file
        if emb_file is not None:
            kwargs["emb_file"] = emb_file
        self.bpemb = BPEmb(**kwargs)
        self.strings = strings
        self.name = repr(self.bpemb)
        self.n_keys = -1
        self.mode = "BPEmb"
        self.to_ops(get_current_ops())

    def __contains__(self, key):
        return True

    def is_full(self):
        return True

    def add(self, key, *, vector=None, row=None):
        warnings.warn(
            (
                "Skipping BPEmbVectors.add: the bpemb vector table cannot be "
                "modified. Vectors are calculated from bytepieces."
            )
        )
        return -1

    def __getitem__(self, key):
        return self.get_batch([key])[0]

    def get_batch(self, keys):
        keys = [self.strings.as_string(key) for key in keys]
        bp_ids = self.bpemb.encode_ids(keys)
        ops = get_array_ops(self.bpemb.emb.vectors)
        indices = ops.asarray(ops.xp.hstack(bp_ids), dtype="int32")
        lengths = ops.asarray([len(x) for x in bp_ids], dtype="int32")
        vecs = ops.reduce_mean(cast(Floats2d, self.bpemb.emb.vectors[indices]), lengths)
        return vecs

    @property
    def shape(self):
        return self.bpemb.vectors.shape

    def __len__(self):
        return self.shape[0]

    @property
    def vectors_length(self):
        return self.shape[1]

    @property
    def size(self):
        return self.bpemb.vectors.size

    def to_ops(self, ops: Ops):
        self.bpemb.emb.vectors = ops.asarray(self.bpemb.emb.vectors)


@registry.vectors("BPEmbVectors.v1")
def create_bpemb_vectors(
    lang: Optional[str] = "multi",
    vs: Optional[int] = None,
    dim: Optional[int] = None,
    cache_dir: Optional[Path] = None,
    encode_extra_options: Optional[str] = None,
    model_file: Optional[Path] = None,
    emb_file: Optional[Path] = None,
) -> Callable[[Vocab], BPEmbVectors]:
    def bpemb_vectors_factory(vocab: Vocab) -> BPEmbVectors:
        return BPEmbVectors(
            strings=vocab.strings,
            lang=lang,
            vs=vs,
            dim=dim,
            cache_dir=cache_dir,
            encode_extra_options=encode_extra_options,
            model_file=model_file,
            emb_file=emb_file,
        )

    return bpemb_vectors_factory
```

<Infobox variant="warning">

Note that the serialization methods are not implemented, so the embeddings are
loaded from your local cache or downloaded by `BPEmb` each time the pipeline is
loaded.

</Infobox>

To use this in your pipeline, specify this registered function under
`[nlp.vectors]` in your config:

```ini
[nlp.vectors]
@vectors = "BPEmbVectors.v1"
lang = "en"
```

Or specify it when creating a blank pipeline:

```python
import spacy

nlp = spacy.blank("en", config={"nlp.vectors": {"@vectors": "BPEmbVectors.v1", "lang": "en"}})
```

Remember to include this code with `--code` when using
[`spacy train`](/api/cli#train) and [`spacy package`](/api/cli#package).

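As a quick sanity check that the custom table is wired in, token vectors should
now be pooled from BPEmb byte-pair pieces. A sketch, assuming the
`BPEmbVectors.v1` function registered above is available in the current session:

```python
import spacy

nlp = spacy.blank("en", config={"nlp.vectors": {"@vectors": "BPEmbVectors.v1", "lang": "en"}})
doc = nlp("unbelievable")
print(doc[0].vector.shape)  # the vector is mean-pooled from BPEmb byte pairs
```
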
## Pretraining {id="pretraining"}

The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your

@@ -20,7 +20,7 @@ menu:

## Installation instructions {id="installation"}

spaCy is compatible with **64-bit CPython 3.7+** and runs on **Unix/Linux**,
**macOS/OS X** and **Windows**. The latest spaCy releases are available over
[pip](https://pypi.python.org/pypi/spacy) and
[conda](https://anaconda.org/conda-forge/spacy).

@@ -170,8 +170,8 @@ to be `"databricks/dolly-v2-12b"` for better performance.

### Example 3: Create the component directly in Python {id="example-3"}

The `llm` component behaves as any other component does, and there are
[task-specific components](/api/large-language-models#config) defined to help
you hit the ground running with a reasonable built-in task implementation.

```python
import spacy

@@ -436,7 +436,7 @@ respectively. Alternatively you can use LangChain to access hosted or local

models by specifying one of the models registered with the `langchain.` prefix.

<Infobox>
_Why use LangChain if there are also native REST and HuggingFace interfaces? When should I use what?_

Third-party libraries like `langchain` focus on prompt management, integration
of many different LLM APIs, and other related features such as conversational

@@ -476,6 +476,7 @@ provider's documentation.

| [`spacy.Curie.v2`](/api/large-language-models#models-rest)              | OpenAI’s `curie` model family.                 |
| [`spacy.Babbage.v2`](/api/large-language-models#models-rest)            | OpenAI’s `babbage` model family.               |
| [`spacy.Ada.v2`](/api/large-language-models#models-rest)                | OpenAI’s `ada` model family.                   |
| [`spacy.Azure.v1`](/api/large-language-models#models-rest)              | Azure's OpenAI models.                         |
| [`spacy.Command.v1`](/api/large-language-models#models-rest)            | Cohere’s `command` model family.               |
| [`spacy.Claude-2.v1`](/api/large-language-models#models-rest)           | Anthropic’s `claude-2` model family.           |
| [`spacy.Claude-1.v1`](/api/large-language-models#models-rest)           | Anthropic’s `claude-1` model family.           |

@@ -484,8 +485,10 @@ provider's documentation.

| [`spacy.Claude-1-0.v1`](/api/large-language-models#models-rest)         | Anthropic’s `claude-1.0` model family.         |
| [`spacy.Claude-1-2.v1`](/api/large-language-models#models-rest)         | Anthropic’s `claude-1.2` model family.         |
| [`spacy.Claude-1-3.v1`](/api/large-language-models#models-rest)         | Anthropic’s `claude-1.3` model family.         |
| [`spacy.PaLM.v1`](/api/large-language-models#models-rest)               | Google’s `PaLM` model family.                  |
| [`spacy.Dolly.v1`](/api/large-language-models#models-hf)                | Dolly models through HuggingFace.              |
| [`spacy.Falcon.v1`](/api/large-language-models#models-hf)               | Falcon models through HuggingFace.             |
| [`spacy.Mistral.v1`](/api/large-language-models#models-hf)              | Mistral models through HuggingFace.            |
| [`spacy.Llama2.v1`](/api/large-language-models#models-hf)               | Llama2 models through HuggingFace.             |
| [`spacy.StableLM.v1`](/api/large-language-models#models-hf)             | StableLM models through HuggingFace.           |
| [`spacy.OpenLLaMA.v1`](/api/large-language-models#models-hf)            | OpenLLaMA models through HuggingFace.          |


@@ -656,9 +656,9 @@ locally.

You can list one or more remotes in the `remotes` section of your
[`project.yml`](#project-yml) by mapping a string name to the URL of the
storage. Under the hood, spaCy uses
[`cloudpathlib`](https://cloudpathlib.drivendata.org) to communicate with the
remote storages, so you can use any protocol that `cloudpathlib` supports,
including [S3](https://aws.amazon.com/s3/),
[Google Cloud Storage](https://cloud.google.com/storage), and the local
filesystem, although you may need to install extra dependencies to use certain
protocols.


@@ -850,14 +850,14 @@ negative pattern. To keep it simple, we'll either add or subtract `0.1` points 

this way, the score will also reflect combinations of emoji, even positive _and_
negative ones.

With a library like [emoji](https://github.com/carpedm20/emoji), we can also
retrieve a short description for each emoji – for example, 😍's official title
is "Smiling Face With Heart-Eyes". Assigning it to a
[custom attribute](/usage/processing-pipelines#custom-components-attributes) on
the emoji span will make it available as `span._.emoji_desc`.

```python
import emoji  # Installation: pip install emoji
from spacy.tokens import Span  # Get the global Span object

Span.set_extension("emoji_desc", default=None)  # Register the custom attribute

@@ -869,9 +869,9 @@ def label_sentiment(matcher, doc, i, matches):

    elif doc.vocab.strings[match_id] == "SAD":
        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment
    span = doc[start:end]
    # Verify that it is an emoji and set the extension attribute accordingly.
    if emoji.is_emoji(span[0].text):
        span._.emoji_desc = emoji.demojize(span[0].text, delimiters=("", ""), language=doc.lang_).replace("_", " ")
```

To label the hashtags, we can use a
@@ -1097,7 +1097,7 @@ come directly from
 [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
 
 | Symbol                                  | Description                                                                                                                    |
-| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
 | `A < B`                                 | `A` is the immediate dependent of `B`.                                                                                         |
 | `A > B`                                 | `A` is the immediate head of `B`.                                                                                              |
 | `A << B`                                | `A` is the dependent in a chain to `B` following dep → head paths.                                                        |
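
These operators supply the `REL_OP` key in
[`DependencyMatcher`](/api/dependencymatcher) patterns. A minimal sketch of the
`>` operator (the pipeline name and example sentence are illustrative):

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    # Anchor token: the verb "founded"
    {"RIGHT_ID": "founded", "RIGHT_ATTRS": {"ORTH": "founded"}},
    # "A > B": the anchor is the immediate head of a nominal subject
    {
        "LEFT_ID": "founded",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
]
matcher.add("FOUNDED", [pattern])
doc = nlp("Smith founded a healthcare company in 2005.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # ['founded', 'Smith']
```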
							
								
								
									
website/docs/usage/v3-7.mdx (new file, 140 lines)

@@ -0,0 +1,140 @@
---
title: What's New in v3.7
teaser: New features and how to upgrade
menu:
  - ['New Features', 'features']
  - ['Upgrading Notes', 'upgrading']
---

## New features {id="features",hidden="true"}

spaCy v3.7 adds support for Python 3.12, introduces the new standalone library
[Weasel](https://github.com/explosion/weasel) for project workflows, and updates
the transformer-based trained pipelines to use our new
[Curated Transformers](https://github.com/explosion/curated-transformers)
library.

This release drops support for Python 3.6.

### Weasel {id="weasel"}

The [spaCy projects](/usage/projects) functionality has been moved into a new
standalone library, [Weasel](https://github.com/explosion/weasel). This brings
minor changes to spaCy-specific settings in spaCy projects (see
[upgrading](#upgrading) below), but also makes it possible to use the same
workflow functionality outside of spaCy.

All `spacy project` commands should run as before; they simply use Weasel under
the hood now.

<Infobox title="Remote storage for Python 3.12" variant="warning">

Remote storage for spaCy projects is not yet supported for Python 3.12. Use
Python 3.11 or earlier for remote storage.

</Infobox>

### Registered vectors {id="custom-vectors"}

You can specify a custom registered vectors class under `[nlp.vectors]` in
order to use static vectors in formats other than the ones supported by
[`Vectors`](/api/vectors). To implement your custom vectors, extend the
abstract class [`BaseVectors`](/api/basevectors). See an example using
[BPEmb subword embeddings](/usage/embeddings-transformers#custom-vectors).
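
For a rough idea of the wiring, here's a minimal sketch (not the BPEmb example
itself): the class name and the registered function name `MyVectors.v1` are
hypothetical, and a real subclass must implement the abstract `BaseVectors`
methods described in the [`BaseVectors`](/api/basevectors) API page.

```python
import spacy
from spacy.vectors import BaseVectors

class MyVectors(BaseVectors):
    # A real implementation overrides the abstract BaseVectors methods
    # to map keys to rows and rows to vectors; omitted here.
    ...

@spacy.registry.vectors("MyVectors.v1")
def create_my_vectors():
    def my_vectors(vocab):
        # Constructor arguments are illustrative only.
        return MyVectors(strings=vocab.strings)
    return my_vectors

# Referenced in the training config as:
#
# [nlp.vectors]
# @vectors = "MyVectors.v1"
```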

### Additional features and improvements {id="additional-features-and-improvements"}

- Add support for Python 3.12.
- Extend to Thinc v8.2.
- Extend `transformers` extra to `spacy-transformers` v1.3.
- Add `--spans-key` option for CLI evaluation with `spacy benchmark accuracy`.
- Load the CLI module lazily for `spacy.info`.
- Add type stubs for `spacy.training.example`.
- Warn for unsupported pattern keys in the dependency matcher.
- `Language.replace_listeners`: Pass the replaced listener and the `tok2vec`
  pipe to the callback in order to support `spacy-curated-transformers`.
- Always use `tqdm` with `disable=None` in order to disable output in
  non-interactive environments (see the short example after this list).
- Language updates:
  - Add left and right pointing angle brackets as punctuation to Ancient Greek.
  - Update example sentences for Turkish.
- Package setup updates:
  - Update NumPy build constraints for NumPy 1.25+. For Python 3.9+, it is no
    longer necessary to set build constraints while building binary wheels.
  - Refactor Cython profiling in order to disable profiling for Python 3.12 in
    the package setup, since Cython does not currently support profiling for
    Python 3.12.
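
Of the bullets above, the `tqdm` change is easy to illustrate: `disable=None`
(unlike `disable=False`) lets `tqdm` decide based on whether the output stream
is a TTY. A minimal sketch:

```python
from tqdm import tqdm

# disable=None: tqdm suppresses the progress bar when stderr is not a
# TTY (e.g. CI logs or piped output); disable=False would force it on.
for _ in tqdm(range(1000), disable=None):
    pass
```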

## Trained pipelines {id="pipelines"}

### Pipeline updates {id="pipeline-updates"}

The transformer-based `trf` pipelines have been updated to use our new
[Curated Transformers](https://github.com/explosion/curated-transformers)
library via the Thinc model wrappers and pipeline component from
[spaCy Curated Transformers](https://github.com/explosion/spacy-curated-transformers).

## Notes about upgrading from v3.6 {id="upgrading"}

This release drops support for Python 3.6, drops mypy checks for Python 3.7,
and removes the `ray` extra. In addition, there are several minor changes for
spaCy projects, described in the following section.

### Backwards incompatibilities for spaCy Projects {id="upgrading-projects"}

`spacy project` has a few backwards incompatibilities due to the transition to
the standalone library [Weasel](https://github.com/explosion/weasel), which is
not as tightly coupled to spaCy. Weasel produces warnings when it detects older
spaCy-specific settings in your environment or project config.

- Support for the `spacy_version` configuration key has been dropped.
- Support for the `check_requirements` configuration key has been dropped due
  to the deprecation of `pkg_resources`.
- The `SPACY_CONFIG_OVERRIDES` environment variable is no longer checked. You
  can set configuration overrides using `WEASEL_CONFIG_OVERRIDES` instead.
- Support for the `SPACY_PROJECT_USE_GIT_VERSION` environment variable has been
  dropped.
- Error codes are now Weasel-specific and no longer follow spaCy error codes.

### Pipeline package version compatibility {id="version-compat"}

> #### Using legacy implementations
>
> In spaCy v3, you'll still be able to load and reference legacy implementations
> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
> components or architectures change and newer versions are available in the
> core library.

When you load a pipeline package trained with an earlier version of spaCy v3,
you will see a warning telling you that the pipeline may be incompatible. The
warning isn't necessarily accurate, but we recommend running your pipelines
against your test suite or evaluation data to make sure there are no unexpected
results.
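
For instance, a minimal smoke test might look like this (the pipeline name and
example text are placeholders for your own):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # or your own pipeline package
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")
# Spot-check that the annotations you rely on are still produced.
assert doc.has_annotation("ENT_IOB")
print([(ent.text, ent.label_) for ent in doc.ents])
```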

If you're using one of the [trained pipelines](/models) we provide, you should
run [`spacy download`](/api/cli#download) to update to the latest version. To
see an overview of all installed packages and their compatibility, you can run
[`spacy validate`](/api/cli#validate).

If you've trained your own custom pipeline and you've confirmed that it's still
working as expected, you can update the spaCy version requirements in the
[`meta.json`](/api/data-formats#meta):

```diff
- "spacy_version": ">=3.6.0,<3.7.0",
+ "spacy_version": ">=3.6.0,<3.8.0",
```

### Updating v3.6 configs

To update a config from spaCy v3.6 with the new v3.7 settings, run
[`init fill-config`](/api/cli#init-fill-config):

```cli
$ python -m spacy init fill-config config-v3.6.cfg config-v3.7.cfg
```

In many cases ([`spacy train`](/api/cli#train),
[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
automatically, but you'll need to fill in the new settings to run
[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
@@ -15,7 +15,8 @@
                     { "text": "New in v3.3", "url": "/usage/v3-3" },
                     { "text": "New in v3.4", "url": "/usage/v3-4" },
                     { "text": "New in v3.5", "url": "/usage/v3-5" },
-                    { "text": "New in v3.6", "url": "/usage/v3-6" }
+                    { "text": "New in v3.6", "url": "/usage/v3-6" },
+                    { "text": "New in v3.7", "url": "/usage/v3-7" }
                 ]
             },
             {
@@ -100,6 +101,7 @@
                 "items": [
                     { "text": "AttributeRuler", "url": "/api/attributeruler" },
                     { "text": "CoreferenceResolver", "url": "/api/coref" },
+                    { "text": "CuratedTransformer", "url": "/api/curatedtransformer" },
                     { "text": "DependencyParser", "url": "/api/dependencyparser" },
                     { "text": "EditTreeLemmatizer", "url": "/api/edittreelemmatizer" },
                     { "text": "EntityLinker", "url": "/api/entitylinker" },
@@ -135,6 +137,7 @@
                 "label": "Other",
                 "items": [
                     { "text": "Attributes", "url": "/api/attributes" },
+                    { "text": "BaseVectors", "url": "/api/basevectors" },
                     { "text": "Corpus", "url": "/api/corpus" },
                     { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
                     { "text": "KnowledgeBase", "url": "/api/kb" },
Some files were not shown because too many files have changed in this diff.