mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	Merge pull request #11973 from essenmitsosse/update-migration-from-master
Update `migration/website` from `master`
This commit is contained in:
		
						commit
						e93952f284
					
				
							
								
								
									
										46
									
								
								.github/azure-steps.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										46
									
								
								.github/azure-steps.yml
									
									
									
									
										vendored
									
									
								
							|  | @ -52,17 +52,17 @@ steps: | ||||||
|       python -W error -c "import spacy" |       python -W error -c "import spacy" | ||||||
|     displayName: "Test import" |     displayName: "Test import" | ||||||
| 
 | 
 | ||||||
|   - script: | | #  - script: | | ||||||
|       python -m spacy download ca_core_news_sm | #      python -m spacy download ca_core_news_sm | ||||||
|       python -m spacy download ca_core_news_md | #      python -m spacy download ca_core_news_md | ||||||
|       python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" | #      python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" | ||||||
|     displayName: 'Test download CLI' | #    displayName: 'Test download CLI' | ||||||
|     condition: eq(variables['python_version'], '3.8') | #    condition: eq(variables['python_version'], '3.8') | ||||||
| 
 | # | ||||||
|   - script: | | #  - script: | | ||||||
|       python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" | #      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" | ||||||
|     displayName: 'Test no warnings on load (#11713)' | #    displayName: 'Test no warnings on load (#11713)' | ||||||
|     condition: eq(variables['python_version'], '3.8') | #    condition: eq(variables['python_version'], '3.8') | ||||||
| 
 | 
 | ||||||
|   - script: | |   - script: | | ||||||
|       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . |       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . | ||||||
|  | @ -86,17 +86,17 @@ steps: | ||||||
|     displayName: 'Test train CLI' |     displayName: 'Test train CLI' | ||||||
|     condition: eq(variables['python_version'], '3.8') |     condition: eq(variables['python_version'], '3.8') | ||||||
| 
 | 
 | ||||||
|   - script: | | #  - script: | | ||||||
|       python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" | #      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" | ||||||
|       PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir | #      PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir | ||||||
|     displayName: 'Test assemble CLI' | #    displayName: 'Test assemble CLI' | ||||||
|     condition: eq(variables['python_version'], '3.8') | #    condition: eq(variables['python_version'], '3.8') | ||||||
| 
 | # | ||||||
|   - script: | | #  - script: | | ||||||
|       python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" | #      python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" | ||||||
|       python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 | #      python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 | ||||||
|     displayName: 'Test assemble CLI vectors warning' | #    displayName: 'Test assemble CLI vectors warning' | ||||||
|     condition: eq(variables['python_version'], '3.8') | #    condition: eq(variables['python_version'], '3.8') | ||||||
| 
 | 
 | ||||||
|   - script: | |   - script: | | ||||||
|       python -m pip install -U -r requirements.txt |       python -m pip install -U -r requirements.txt | ||||||
|  | @ -107,7 +107,7 @@ steps: | ||||||
|     displayName: "Run CPU tests" |     displayName: "Run CPU tests" | ||||||
| 
 | 
 | ||||||
|   - script: | |   - script: | | ||||||
|       python -m pip install --pre thinc-apple-ops |       python -m pip install 'spacy[apple]' | ||||||
|       python -m pytest --pyargs spacy |       python -m pytest --pyargs spacy | ||||||
|     displayName: "Run CPU tests with thinc-apple-ops" |     displayName: "Run CPU tests with thinc-apple-ops" | ||||||
|     condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) |     condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11')) | ||||||
|  |  | ||||||
							
								
								
									
										8
									
								
								.github/workflows/lock.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										8
									
								
								.github/workflows/lock.yml
									
									
									
									
										vendored
									
									
								
							|  | @ -15,11 +15,11 @@ jobs: | ||||||
|   action: |   action: | ||||||
|     runs-on: ubuntu-latest |     runs-on: ubuntu-latest | ||||||
|     steps: |     steps: | ||||||
|       - uses: dessant/lock-threads@v3 |       - uses: dessant/lock-threads@v4 | ||||||
|         with: |         with: | ||||||
|           process-only: 'issues' |           process-only: 'issues' | ||||||
|           issue-inactive-days: '30' |           issue-inactive-days: '30' | ||||||
|           issue-comment: >  |           issue-comment: > | ||||||
|             This thread has been automatically locked since there  |             This thread has been automatically locked since there | ||||||
|             has not been any recent activity after it was closed.  |             has not been any recent activity after it was closed. | ||||||
|             Please open a new issue for related bugs. |             Please open a new issue for related bugs. | ||||||
|  |  | ||||||
|  | @ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more, | ||||||
| multi-task learning with pretrained **transformers** like BERT, as well as a | multi-task learning with pretrained **transformers** like BERT, as well as a | ||||||
| production-ready [**training system**](https://spacy.io/usage/training) and easy | production-ready [**training system**](https://spacy.io/usage/training) and easy | ||||||
| model packaging, deployment and workflow management. spaCy is commercial | model packaging, deployment and workflow management. spaCy is commercial | ||||||
| open-source software, released under the MIT license. | open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). | ||||||
| 
 | 
 | ||||||
| 💫 **Version 3.4 out now!** | 💫 **Version 3.4 out now!** | ||||||
| [Check out the release notes here.](https://github.com/explosion/spaCy/releases) | [Check out the release notes here.](https://github.com/explosion/spaCy/releases) | ||||||
|  | @ -46,6 +46,7 @@ open-source software, released under the MIT license. | ||||||
| | 🛠 **[Changelog]**                                                                                                                                                                                                         | Changes and version history.                                                                                                                                                                                                                                                                                                 | | | 🛠 **[Changelog]**                                                                                                                                                                                                         | Changes and version history.                                                                                                                                                                                                                                                                                                 | | ||||||
| | 💝 **[Contribute]**                                                                                                                                                                                                       | How to contribute to the spaCy project and code base.                                                                                                                                                                                                                                                                        | | | 💝 **[Contribute]**                                                                                                                                                                                                       | How to contribute to the spaCy project and code base.                                                                                                                                                                                                                                                                        | | ||||||
| | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** | | | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** | | ||||||
|  | | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** | | ||||||
| 
 | 
 | ||||||
| [spacy 101]: https://spacy.io/usage/spacy-101 | [spacy 101]: https://spacy.io/usage/spacy-101 | ||||||
| [new in v3.0]: https://spacy.io/usage/v3 | [new in v3.0]: https://spacy.io/usage/v3 | ||||||
|  | @ -59,6 +60,7 @@ open-source software, released under the MIT license. | ||||||
| [changelog]: https://spacy.io/usage#changelog | [changelog]: https://spacy.io/usage#changelog | ||||||
| [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md | [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| ## 💬 Where to ask questions | ## 💬 Where to ask questions | ||||||
| 
 | 
 | ||||||
| The spaCy project is maintained by the [spaCy team](https://explosion.ai/about). | The spaCy project is maintained by the [spaCy team](https://explosion.ai/about). | ||||||
|  |  | ||||||
|  | @ -41,7 +41,7 @@ jobs: | ||||||
|       matrix: |       matrix: | ||||||
|         # We're only running one platform per Python version to speed up builds |         # We're only running one platform per Python version to speed up builds | ||||||
|         Python36Linux: |         Python36Linux: | ||||||
|           imageName: "ubuntu-latest" |           imageName: "ubuntu-20.04" | ||||||
|           python.version: "3.6" |           python.version: "3.6" | ||||||
|         #        Python36Windows: |         #        Python36Windows: | ||||||
|         #          imageName: "windows-latest" |         #          imageName: "windows-latest" | ||||||
|  | @ -50,7 +50,7 @@ jobs: | ||||||
|         #          imageName: "macos-latest" |         #          imageName: "macos-latest" | ||||||
|         #          python.version: "3.6" |         #          python.version: "3.6" | ||||||
|         #        Python37Linux: |         #        Python37Linux: | ||||||
|         #          imageName: "ubuntu-latest" |         #          imageName: "ubuntu-20.04" | ||||||
|         #          python.version: "3.7" |         #          python.version: "3.7" | ||||||
|         Python37Windows: |         Python37Windows: | ||||||
|           imageName: "windows-latest" |           imageName: "windows-latest" | ||||||
|  |  | ||||||
|  | @ -6,11 +6,11 @@ preshed>=3.0.2,<3.1.0 | ||||||
| thinc>=8.1.0,<8.2.0 | thinc>=8.1.0,<8.2.0 | ||||||
| ml_datasets>=0.2.0,<0.3.0 | ml_datasets>=0.2.0,<0.3.0 | ||||||
| murmurhash>=0.28.0,<1.1.0 | murmurhash>=0.28.0,<1.1.0 | ||||||
| wasabi>=0.9.1,<1.1.0 | wasabi>=0.9.1,<1.2.0 | ||||||
| srsly>=2.4.3,<3.0.0 | srsly>=2.4.3,<3.0.0 | ||||||
| catalogue>=2.0.6,<2.1.0 | catalogue>=2.0.6,<2.1.0 | ||||||
| typer>=0.3.0,<0.8.0 | typer>=0.3.0,<0.8.0 | ||||||
| pathy>=0.3.5 | pathy>=0.10.0 | ||||||
| smart-open>=5.2.1,<7.0.0 | smart-open>=5.2.1,<7.0.0 | ||||||
| # Third party dependencies | # Third party dependencies | ||||||
| numpy>=1.15.0 | numpy>=1.15.0 | ||||||
|  |  | ||||||
|  | @ -47,12 +47,12 @@ install_requires = | ||||||
|     cymem>=2.0.2,<2.1.0 |     cymem>=2.0.2,<2.1.0 | ||||||
|     preshed>=3.0.2,<3.1.0 |     preshed>=3.0.2,<3.1.0 | ||||||
|     thinc>=8.1.0,<8.2.0 |     thinc>=8.1.0,<8.2.0 | ||||||
|     wasabi>=0.9.1,<1.1.0 |     wasabi>=0.9.1,<1.2.0 | ||||||
|     srsly>=2.4.3,<3.0.0 |     srsly>=2.4.3,<3.0.0 | ||||||
|     catalogue>=2.0.6,<2.1.0 |     catalogue>=2.0.6,<2.1.0 | ||||||
|     # Third-party dependencies |     # Third-party dependencies | ||||||
|     typer>=0.3.0,<0.8.0 |     typer>=0.3.0,<0.8.0 | ||||||
|     pathy>=0.3.5 |     pathy>=0.10.0 | ||||||
|     smart-open>=5.2.1,<7.0.0 |     smart-open>=5.2.1,<7.0.0 | ||||||
|     tqdm>=4.38.0,<5.0.0 |     tqdm>=4.38.0,<5.0.0 | ||||||
|     numpy>=1.15.0 |     numpy>=1.15.0 | ||||||
|  |  | ||||||
|  | @ -1,6 +1,6 @@ | ||||||
| # fmt: off | # fmt: off | ||||||
| __title__ = "spacy" | __title__ = "spacy" | ||||||
| __version__ = "3.4.2" | __version__ = "3.5.0" | ||||||
| __download_url__ = "https://github.com/explosion/spacy-models/releases/download" | __download_url__ = "https://github.com/explosion/spacy-models/releases/download" | ||||||
| __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" | __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" | ||||||
| __projects__ = "https://github.com/explosion/projects" | __projects__ = "https://github.com/explosion/projects" | ||||||
|  |  | ||||||
|  | @ -23,7 +23,7 @@ from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS | ||||||
| from .. import about | from .. import about | ||||||
| 
 | 
 | ||||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||||
|     from pathy import Pathy  # noqa: F401 |     from pathy import FluidPath  # noqa: F401 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| SDIST_SUFFIX = ".tar.gz" | SDIST_SUFFIX = ".tar.gz" | ||||||
|  | @ -158,15 +158,15 @@ def load_project_config( | ||||||
|         sys.exit(1) |         sys.exit(1) | ||||||
|     validate_project_version(config) |     validate_project_version(config) | ||||||
|     validate_project_commands(config) |     validate_project_commands(config) | ||||||
|  |     if interpolate: | ||||||
|  |         err = f"{PROJECT_FILE} validation error" | ||||||
|  |         with show_validation_error(title=err, hint_fill=False): | ||||||
|  |             config = substitute_project_variables(config, overrides) | ||||||
|     # Make sure directories defined in config exist |     # Make sure directories defined in config exist | ||||||
|     for subdir in config.get("directories", []): |     for subdir in config.get("directories", []): | ||||||
|         dir_path = path / subdir |         dir_path = path / subdir | ||||||
|         if not dir_path.exists(): |         if not dir_path.exists(): | ||||||
|             dir_path.mkdir(parents=True) |             dir_path.mkdir(parents=True) | ||||||
|     if interpolate: |  | ||||||
|         err = f"{PROJECT_FILE} validation error" |  | ||||||
|         with show_validation_error(title=err, hint_fill=False): |  | ||||||
|             config = substitute_project_variables(config, overrides) |  | ||||||
|     return config |     return config | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -331,7 +331,7 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None: | ||||||
|             msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) |             msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None: | def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None: | ||||||
|     """Upload a file. |     """Upload a file. | ||||||
| 
 | 
 | ||||||
|     src (Path): The source path. |     src (Path): The source path. | ||||||
|  | @ -339,13 +339,20 @@ def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None: | ||||||
|     """ |     """ | ||||||
|     import smart_open |     import smart_open | ||||||
| 
 | 
 | ||||||
|  |     # Create parent directories for local paths | ||||||
|  |     if isinstance(dest, Path): | ||||||
|  |         if not dest.parent.exists(): | ||||||
|  |             dest.parent.mkdir(parents=True) | ||||||
|  | 
 | ||||||
|     dest = str(dest) |     dest = str(dest) | ||||||
|     with smart_open.open(dest, mode="wb") as output_file: |     with smart_open.open(dest, mode="wb") as output_file: | ||||||
|         with src.open(mode="rb") as input_file: |         with src.open(mode="rb") as input_file: | ||||||
|             output_file.write(input_file.read()) |             output_file.write(input_file.read()) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None: | def download_file( | ||||||
|  |     src: Union[str, "FluidPath"], dest: Path, *, force: bool = False | ||||||
|  | ) -> None: | ||||||
|     """Download a file using smart_open. |     """Download a file using smart_open. | ||||||
| 
 | 
 | ||||||
|     url (str): The URL of the file. |     url (str): The URL of the file. | ||||||
|  | @ -368,7 +375,7 @@ def ensure_pathy(path): | ||||||
|     slow and annoying Google Cloud warning).""" |     slow and annoying Google Cloud warning).""" | ||||||
|     from pathy import Pathy  # noqa: F811 |     from pathy import Pathy  # noqa: F811 | ||||||
| 
 | 
 | ||||||
|     return Pathy(path) |     return Pathy.fluid(path) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def git_checkout( | def git_checkout( | ||||||
|  |  | ||||||
|  | @ -13,6 +13,7 @@ from ._util import import_code, debug_cli, _format_number | ||||||
| from ..training import Example, remove_bilu_prefix | from ..training import Example, remove_bilu_prefix | ||||||
| from ..training.initialize import get_sourced_components | from ..training.initialize import get_sourced_components | ||||||
| from ..schemas import ConfigSchemaTraining | from ..schemas import ConfigSchemaTraining | ||||||
|  | from ..pipeline import TrainablePipe | ||||||
| from ..pipeline._parser_internals import nonproj | from ..pipeline._parser_internals import nonproj | ||||||
| from ..pipeline._parser_internals.nonproj import DELIMITER | from ..pipeline._parser_internals.nonproj import DELIMITER | ||||||
| from ..pipeline import Morphologizer, SpanCategorizer | from ..pipeline import Morphologizer, SpanCategorizer | ||||||
|  | @ -934,6 +935,7 @@ def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]: | ||||||
|     labels: Set[str] = set() |     labels: Set[str] = set() | ||||||
|     for pipe_name in pipe_names: |     for pipe_name in pipe_names: | ||||||
|         pipe = nlp.get_pipe(pipe_name) |         pipe = nlp.get_pipe(pipe_name) | ||||||
|  |         assert isinstance(pipe, TrainablePipe) | ||||||
|         labels.update(pipe.labels) |         labels.update(pipe.labels) | ||||||
|     return labels |     return labels | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -5,15 +5,17 @@ import hashlib | ||||||
| import urllib.parse | import urllib.parse | ||||||
| import tarfile | import tarfile | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
|  | from wasabi import msg | ||||||
| 
 | 
 | ||||||
| from .._util import get_hash, get_checksum, download_file, ensure_pathy | from .._util import get_hash, get_checksum, upload_file, download_file | ||||||
| from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var | from .._util import ensure_pathy, make_tempdir | ||||||
|  | from ...util import get_minor_version, ENV_VARS, check_bool_env_var | ||||||
| from ...git_info import GIT_VERSION | from ...git_info import GIT_VERSION | ||||||
| from ... import about | from ... import about | ||||||
| from ...errors import Errors | from ...errors import Errors | ||||||
| 
 | 
 | ||||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||||
|     from pathy import Pathy  # noqa: F401 |     from pathy import FluidPath  # noqa: F401 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class RemoteStorage: | class RemoteStorage: | ||||||
|  | @ -28,7 +30,7 @@ class RemoteStorage: | ||||||
|         self.url = ensure_pathy(url) |         self.url = ensure_pathy(url) | ||||||
|         self.compression = compression |         self.compression = compression | ||||||
| 
 | 
 | ||||||
|     def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": |     def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath": | ||||||
|         """Compress a file or directory within a project and upload it to a remote |         """Compress a file or directory within a project and upload it to a remote | ||||||
|         storage. If an object exists at the full URL, nothing is done. |         storage. If an object exists at the full URL, nothing is done. | ||||||
| 
 | 
 | ||||||
|  | @ -49,9 +51,7 @@ class RemoteStorage: | ||||||
|             mode_string = f"w:{self.compression}" if self.compression else "w" |             mode_string = f"w:{self.compression}" if self.compression else "w" | ||||||
|             with tarfile.open(tar_loc, mode=mode_string) as tar_file: |             with tarfile.open(tar_loc, mode=mode_string) as tar_file: | ||||||
|                 tar_file.add(str(loc), arcname=str(path)) |                 tar_file.add(str(loc), arcname=str(path)) | ||||||
|             with tar_loc.open(mode="rb") as input_file: |             upload_file(tar_loc, url) | ||||||
|                 with url.open(mode="wb") as output_file: |  | ||||||
|                     output_file.write(input_file.read()) |  | ||||||
|         return url |         return url | ||||||
| 
 | 
 | ||||||
|     def pull( |     def pull( | ||||||
|  | @ -60,7 +60,7 @@ class RemoteStorage: | ||||||
|         *, |         *, | ||||||
|         command_hash: Optional[str] = None, |         command_hash: Optional[str] = None, | ||||||
|         content_hash: Optional[str] = None, |         content_hash: Optional[str] = None, | ||||||
|     ) -> Optional["Pathy"]: |     ) -> Optional["FluidPath"]: | ||||||
|         """Retrieve a file from the remote cache. If the file already exists, |         """Retrieve a file from the remote cache. If the file already exists, | ||||||
|         nothing is done. |         nothing is done. | ||||||
| 
 | 
 | ||||||
|  | @ -110,25 +110,37 @@ class RemoteStorage: | ||||||
|         *, |         *, | ||||||
|         command_hash: Optional[str] = None, |         command_hash: Optional[str] = None, | ||||||
|         content_hash: Optional[str] = None, |         content_hash: Optional[str] = None, | ||||||
|     ) -> Optional["Pathy"]: |     ) -> Optional["FluidPath"]: | ||||||
|         """Find the best matching version of a file within the storage, |         """Find the best matching version of a file within the storage, | ||||||
|         or `None` if no match can be found. If both the creation and content hash |         or `None` if no match can be found. If both the creation and content hash | ||||||
|         are specified, only exact matches will be returned. Otherwise, the most |         are specified, only exact matches will be returned. Otherwise, the most | ||||||
|         recent matching file is preferred. |         recent matching file is preferred. | ||||||
|         """ |         """ | ||||||
|         name = self.encode_name(str(path)) |         name = self.encode_name(str(path)) | ||||||
|  |         urls = [] | ||||||
|         if command_hash is not None and content_hash is not None: |         if command_hash is not None and content_hash is not None: | ||||||
|             url = self.make_url(path, command_hash, content_hash) |             url = self.url / name / command_hash / content_hash | ||||||
|             urls = [url] if url.exists() else [] |             urls = [url] if url.exists() else [] | ||||||
|         elif command_hash is not None: |         elif command_hash is not None: | ||||||
|             urls = list((self.url / name / command_hash).iterdir()) |             if (self.url / name / command_hash).exists(): | ||||||
|  |                 urls = list((self.url / name / command_hash).iterdir()) | ||||||
|         else: |         else: | ||||||
|             urls = list((self.url / name).iterdir()) |             if (self.url / name).exists(): | ||||||
|             if content_hash is not None: |                 for sub_dir in (self.url / name).iterdir(): | ||||||
|                 urls = [url for url in urls if url.parts[-1] == content_hash] |                     urls.extend(sub_dir.iterdir()) | ||||||
|  |                 if content_hash is not None: | ||||||
|  |                     urls = [url for url in urls if url.parts[-1] == content_hash] | ||||||
|  |         if len(urls) >= 2: | ||||||
|  |             try: | ||||||
|  |                 urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore | ||||||
|  |             except Exception: | ||||||
|  |                 msg.warn( | ||||||
|  |                     "Unable to sort remote files by last modified. The file(s) " | ||||||
|  |                     "pulled from the cache may not be the most recent." | ||||||
|  |                 ) | ||||||
|         return urls[-1] if urls else None |         return urls[-1] if urls else None | ||||||
| 
 | 
 | ||||||
|     def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": |     def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath": | ||||||
|         """Construct a URL from a subpath, a creation hash and a content hash.""" |         """Construct a URL from a subpath, a creation hash and a content hash.""" | ||||||
|         return self.url / self.encode_name(str(path)) / command_hash / content_hash |         return self.url / self.encode_name(str(path)) / command_hash / content_hash | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -101,8 +101,8 @@ def project_run( | ||||||
|             if not (project_dir / dep).exists(): |             if not (project_dir / dep).exists(): | ||||||
|                 err = f"Missing dependency specified by command '{subcommand}': {dep}" |                 err = f"Missing dependency specified by command '{subcommand}': {dep}" | ||||||
|                 err_help = "Maybe you forgot to run the 'project assets' command or a previous step?" |                 err_help = "Maybe you forgot to run the 'project assets' command or a previous step?" | ||||||
|                 err_kwargs = {"exits": 1} if not dry else {} |                 err_exits = 1 if not dry else None | ||||||
|                 msg.fail(err, err_help, **err_kwargs) |                 msg.fail(err, err_help, exits=err_exits) | ||||||
|         check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) |         check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) | ||||||
|         with working_dir(project_dir) as current_dir: |         with working_dir(project_dir) as current_dir: | ||||||
|             msg.divider(subcommand) |             msg.divider(subcommand) | ||||||
|  |  | ||||||
|  | @ -1,7 +1,7 @@ | ||||||
| {# This is a template for training configs used for the quickstart widget in | {# This is a template for training configs used for the quickstart widget in | ||||||
| the docs and the init config command. It encodes various best practices and | the docs and the init config command. It encodes various best practices and | ||||||
| can help generate the best possible configuration, given a user's requirements. #} | can help generate the best possible configuration, given a user's requirements. #} | ||||||
| {%- set use_transformer = hardware != "cpu" -%} | {%- set use_transformer = hardware != "cpu" and transformer_data -%} | ||||||
| {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} | {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} | ||||||
| {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%} | {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%} | ||||||
| [paths] | [paths] | ||||||
|  |  | ||||||
|  | @ -345,6 +345,11 @@ class Errors(metaclass=ErrorsWithCodes): | ||||||
|             "clear the existing vectors and resize the table.") |             "clear the existing vectors and resize the table.") | ||||||
|     E074 = ("Error interpreting compiled match pattern: patterns are expected " |     E074 = ("Error interpreting compiled match pattern: patterns are expected " | ||||||
|             "to end with the attribute {attr}. Got: {bad_attr}.") |             "to end with the attribute {attr}. Got: {bad_attr}.") | ||||||
|  |     E079 = ("Error computing states in beam: number of predicted beams " | ||||||
|  |             "({pbeams}) does not equal number of gold beams ({gbeams}).") | ||||||
|  |     E080 = ("Duplicate state found in beam: {key}.") | ||||||
|  |     E081 = ("Error getting gradient in beam: number of histories ({n_hist}) " | ||||||
|  |             "does not equal number of losses ({losses}).") | ||||||
|     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), " |     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), " | ||||||
|             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not " |             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not " | ||||||
|             "match.") |             "match.") | ||||||
|  |  | ||||||
|  | @ -43,8 +43,7 @@ from .lookups import load_lookups | ||||||
| from .compat import Literal | from .compat import Literal | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| if TYPE_CHECKING: | PipeCallable = Callable[[Doc], Doc] | ||||||
|     from .pipeline import Pipe  # noqa: F401 |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # This is the base config will all settings (training etc.) | # This is the base config will all settings (training etc.) | ||||||
|  | @ -181,7 +180,7 @@ class Language: | ||||||
|         self.vocab: Vocab = vocab |         self.vocab: Vocab = vocab | ||||||
|         if self.lang is None: |         if self.lang is None: | ||||||
|             self.lang = self.vocab.lang |             self.lang = self.vocab.lang | ||||||
|         self._components: List[Tuple[str, "Pipe"]] = [] |         self._components: List[Tuple[str, PipeCallable]] = [] | ||||||
|         self._disabled: Set[str] = set() |         self._disabled: Set[str] = set() | ||||||
|         self.max_length = max_length |         self.max_length = max_length | ||||||
|         # Create the default tokenizer from the default config |         # Create the default tokenizer from the default config | ||||||
|  | @ -303,7 +302,7 @@ class Language: | ||||||
|         return SimpleFrozenList(names) |         return SimpleFrozenList(names) | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def components(self) -> List[Tuple[str, "Pipe"]]: |     def components(self) -> List[Tuple[str, PipeCallable]]: | ||||||
|         """Get all (name, component) tuples in the pipeline, including the |         """Get all (name, component) tuples in the pipeline, including the | ||||||
|         currently disabled components. |         currently disabled components. | ||||||
|         """ |         """ | ||||||
|  | @ -322,12 +321,12 @@ class Language: | ||||||
|         return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names")) |         return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names")) | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def pipeline(self) -> List[Tuple[str, "Pipe"]]: |     def pipeline(self) -> List[Tuple[str, PipeCallable]]: | ||||||
|         """The processing pipeline consisting of (name, component) tuples. The |         """The processing pipeline consisting of (name, component) tuples. The | ||||||
|         components are called on the Doc in order as it passes through the |         components are called on the Doc in order as it passes through the | ||||||
|         pipeline. |         pipeline. | ||||||
| 
 | 
 | ||||||
|         RETURNS (List[Tuple[str, Pipe]]): The pipeline. |         RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline. | ||||||
|         """ |         """ | ||||||
|         pipes = [(n, p) for n, p in self._components if n not in self._disabled] |         pipes = [(n, p) for n, p in self._components if n not in self._disabled] | ||||||
|         return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline")) |         return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline")) | ||||||
|  | @ -527,7 +526,7 @@ class Language: | ||||||
|         assigns: Iterable[str] = SimpleFrozenList(), |         assigns: Iterable[str] = SimpleFrozenList(), | ||||||
|         requires: Iterable[str] = SimpleFrozenList(), |         requires: Iterable[str] = SimpleFrozenList(), | ||||||
|         retokenizes: bool = False, |         retokenizes: bool = False, | ||||||
|         func: Optional["Pipe"] = None, |         func: Optional[PipeCallable] = None, | ||||||
|     ) -> Callable[..., Any]: |     ) -> Callable[..., Any]: | ||||||
|         """Register a new pipeline component. Can be used for stateless function |         """Register a new pipeline component. Can be used for stateless function | ||||||
|         components that don't require a separate factory. Can be used as a |         components that don't require a separate factory. Can be used as a | ||||||
|  | @ -542,7 +541,7 @@ class Language: | ||||||
|             e.g. "token.ent_id". Used for pipeline analysis. |             e.g. "token.ent_id". Used for pipeline analysis. | ||||||
|         retokenizes (bool): Whether the component changes the tokenization. |         retokenizes (bool): Whether the component changes the tokenization. | ||||||
|             Used for pipeline analysis. |             Used for pipeline analysis. | ||||||
|         func (Optional[Callable]): Factory function if not used as a decorator. |         func (Optional[Callable[[Doc], Doc]): Factory function if not used as a decorator. | ||||||
| 
 | 
 | ||||||
|         DOCS: https://spacy.io/api/language#component |         DOCS: https://spacy.io/api/language#component | ||||||
|         """ |         """ | ||||||
|  | @ -553,11 +552,11 @@ class Language: | ||||||
|                 raise ValueError(Errors.E853.format(name=name)) |                 raise ValueError(Errors.E853.format(name=name)) | ||||||
|         component_name = name if name is not None else util.get_object_name(func) |         component_name = name if name is not None else util.get_object_name(func) | ||||||
| 
 | 
 | ||||||
|         def add_component(component_func: "Pipe") -> Callable: |         def add_component(component_func: PipeCallable) -> Callable: | ||||||
|             if isinstance(func, type):  # function is a class |             if isinstance(func, type):  # function is a class | ||||||
|                 raise ValueError(Errors.E965.format(name=component_name)) |                 raise ValueError(Errors.E965.format(name=component_name)) | ||||||
| 
 | 
 | ||||||
|             def factory_func(nlp, name: str) -> "Pipe": |             def factory_func(nlp, name: str) -> PipeCallable: | ||||||
|                 return component_func |                 return component_func | ||||||
| 
 | 
 | ||||||
|             internal_name = cls.get_factory_name(name) |             internal_name = cls.get_factory_name(name) | ||||||
|  | @ -607,7 +606,7 @@ class Language: | ||||||
|             print_pipe_analysis(analysis, keys=keys) |             print_pipe_analysis(analysis, keys=keys) | ||||||
|         return analysis |         return analysis | ||||||
| 
 | 
 | ||||||
|     def get_pipe(self, name: str) -> "Pipe": |     def get_pipe(self, name: str) -> PipeCallable: | ||||||
|         """Get a pipeline component for a given component name. |         """Get a pipeline component for a given component name. | ||||||
| 
 | 
 | ||||||
|         name (str): Name of pipeline component to get. |         name (str): Name of pipeline component to get. | ||||||
|  | @ -628,7 +627,7 @@ class Language: | ||||||
|         config: Dict[str, Any] = SimpleFrozenDict(), |         config: Dict[str, Any] = SimpleFrozenDict(), | ||||||
|         raw_config: Optional[Config] = None, |         raw_config: Optional[Config] = None, | ||||||
|         validate: bool = True, |         validate: bool = True, | ||||||
|     ) -> "Pipe": |     ) -> PipeCallable: | ||||||
|         """Create a pipeline component. Mostly used internally. To create and |         """Create a pipeline component. Mostly used internally. To create and | ||||||
|         add a component to the pipeline, you can use nlp.add_pipe. |         add a component to the pipeline, you can use nlp.add_pipe. | ||||||
| 
 | 
 | ||||||
|  | @ -640,7 +639,7 @@ class Language: | ||||||
|         raw_config (Optional[Config]): Internals: the non-interpolated config. |         raw_config (Optional[Config]): Internals: the non-interpolated config. | ||||||
|         validate (bool): Whether to validate the component config against the |         validate (bool): Whether to validate the component config against the | ||||||
|             arguments and types expected by the factory. |             arguments and types expected by the factory. | ||||||
|         RETURNS (Pipe): The pipeline component. |         RETURNS (Callable[[Doc], Doc]): The pipeline component. | ||||||
| 
 | 
 | ||||||
|         DOCS: https://spacy.io/api/language#create_pipe |         DOCS: https://spacy.io/api/language#create_pipe | ||||||
|         """ |         """ | ||||||
|  | @ -695,13 +694,13 @@ class Language: | ||||||
| 
 | 
 | ||||||
|     def create_pipe_from_source( |     def create_pipe_from_source( | ||||||
|         self, source_name: str, source: "Language", *, name: str |         self, source_name: str, source: "Language", *, name: str | ||||||
|     ) -> Tuple["Pipe", str]: |     ) -> Tuple[PipeCallable, str]: | ||||||
|         """Create a pipeline component by copying it from an existing model. |         """Create a pipeline component by copying it from an existing model. | ||||||
| 
 | 
 | ||||||
|         source_name (str): Name of the component in the source pipeline. |         source_name (str): Name of the component in the source pipeline. | ||||||
|         source (Language): The source nlp object to copy from. |         source (Language): The source nlp object to copy from. | ||||||
|         name (str): Optional alternative name to use in current pipeline. |         name (str): Optional alternative name to use in current pipeline. | ||||||
|         RETURNS (Tuple[Callable, str]): The component and its factory name. |         RETURNS (Tuple[Callable[[Doc], Doc], str]): The component and its factory name. | ||||||
|         """ |         """ | ||||||
|         # Check source type |         # Check source type | ||||||
|         if not isinstance(source, Language): |         if not isinstance(source, Language): | ||||||
|  | @ -740,7 +739,7 @@ class Language: | ||||||
|         config: Dict[str, Any] = SimpleFrozenDict(), |         config: Dict[str, Any] = SimpleFrozenDict(), | ||||||
|         raw_config: Optional[Config] = None, |         raw_config: Optional[Config] = None, | ||||||
|         validate: bool = True, |         validate: bool = True, | ||||||
|     ) -> "Pipe": |     ) -> PipeCallable: | ||||||
|         """Add a component to the processing pipeline. Valid components are |         """Add a component to the processing pipeline. Valid components are | ||||||
|         callables that take a `Doc` object, modify it and return it. Only one |         callables that take a `Doc` object, modify it and return it. Only one | ||||||
|         of before/after/first/last can be set. Default behaviour is "last". |         of before/after/first/last can be set. Default behaviour is "last". | ||||||
|  | @ -763,7 +762,7 @@ class Language: | ||||||
|         raw_config (Optional[Config]): Internals: the non-interpolated config. |         raw_config (Optional[Config]): Internals: the non-interpolated config. | ||||||
|         validate (bool): Whether to validate the component config against the |         validate (bool): Whether to validate the component config against the | ||||||
|             arguments and types expected by the factory. |             arguments and types expected by the factory. | ||||||
|         RETURNS (Pipe): The pipeline component. |         RETURNS (Callable[[Doc], Doc]): The pipeline component. | ||||||
| 
 | 
 | ||||||
|         DOCS: https://spacy.io/api/language#add_pipe |         DOCS: https://spacy.io/api/language#add_pipe | ||||||
|         """ |         """ | ||||||
|  | @ -869,7 +868,7 @@ class Language: | ||||||
|         *, |         *, | ||||||
|         config: Dict[str, Any] = SimpleFrozenDict(), |         config: Dict[str, Any] = SimpleFrozenDict(), | ||||||
|         validate: bool = True, |         validate: bool = True, | ||||||
|     ) -> "Pipe": |     ) -> PipeCallable: | ||||||
|         """Replace a component in the pipeline. |         """Replace a component in the pipeline. | ||||||
| 
 | 
 | ||||||
|         name (str): Name of the component to replace. |         name (str): Name of the component to replace. | ||||||
|  | @ -878,7 +877,7 @@ class Language: | ||||||
|             component. Will be merged with default config, if available. |             component. Will be merged with default config, if available. | ||||||
|         validate (bool): Whether to validate the component config against the |         validate (bool): Whether to validate the component config against the | ||||||
|             arguments and types expected by the factory. |             arguments and types expected by the factory. | ||||||
|         RETURNS (Pipe): The new pipeline component. |         RETURNS (Callable[[Doc], Doc]): The new pipeline component. | ||||||
| 
 | 
 | ||||||
|         DOCS: https://spacy.io/api/language#replace_pipe |         DOCS: https://spacy.io/api/language#replace_pipe | ||||||
|         """ |         """ | ||||||
|  | @ -930,11 +929,11 @@ class Language: | ||||||
|             init_cfg = self._config["initialize"]["components"].pop(old_name) |             init_cfg = self._config["initialize"]["components"].pop(old_name) | ||||||
|             self._config["initialize"]["components"][new_name] = init_cfg |             self._config["initialize"]["components"][new_name] = init_cfg | ||||||
| 
 | 
 | ||||||
|     def remove_pipe(self, name: str) -> Tuple[str, "Pipe"]: |     def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]: | ||||||
|         """Remove a component from the pipeline. |         """Remove a component from the pipeline. | ||||||
| 
 | 
 | ||||||
|         name (str): Name of the component to remove. |         name (str): Name of the component to remove. | ||||||
|         RETURNS (tuple): A `(name, component)` tuple of the removed component. |         RETURNS (Tuple[str, Callable[[Doc], Doc]]): A `(name, component)` tuple of the removed component. | ||||||
| 
 | 
 | ||||||
|         DOCS: https://spacy.io/api/language#remove_pipe |         DOCS: https://spacy.io/api/language#remove_pipe | ||||||
|         """ |         """ | ||||||
|  | @ -1349,15 +1348,15 @@ class Language: | ||||||
| 
 | 
 | ||||||
|     def set_error_handler( |     def set_error_handler( | ||||||
|         self, |         self, | ||||||
|         error_handler: Callable[[str, "Pipe", List[Doc], Exception], NoReturn], |         error_handler: Callable[[str, PipeCallable, List[Doc], Exception], NoReturn], | ||||||
|     ): |     ): | ||||||
|         """Set an error handler object for all the components in the pipeline that implement |         """Set an error handler object for all the components in the pipeline | ||||||
|         a set_error_handler function. |         that implement a set_error_handler function. | ||||||
| 
 | 
 | ||||||
|         error_handler (Callable[[str, Pipe, List[Doc], Exception], NoReturn]): |         error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], NoReturn]): | ||||||
|             Function that deals with a failing batch of documents. This callable function should take in |             Function that deals with a failing batch of documents. This callable | ||||||
|             the component's name, the component itself, the offending batch of documents, and the exception |             function should take in the component's name, the component itself, | ||||||
|             that was thrown. |             the offending batch of documents, and the exception that was thrown. | ||||||
|         DOCS: https://spacy.io/api/language#set_error_handler |         DOCS: https://spacy.io/api/language#set_error_handler | ||||||
|         """ |         """ | ||||||
|         self.default_error_handler = error_handler |         self.default_error_handler = error_handler | ||||||
|  |  | ||||||
|  | @ -328,9 +328,9 @@ class EditTreeLemmatizer(TrainablePipe): | ||||||
| 
 | 
 | ||||||
|             tree = dict(tree) |             tree = dict(tree) | ||||||
|             if "orig" in tree: |             if "orig" in tree: | ||||||
|                 tree["orig"] = self.vocab.strings[tree["orig"]] |                 tree["orig"] = self.vocab.strings.add(tree["orig"]) | ||||||
|             if "orig" in tree: |             if "orig" in tree: | ||||||
|                 tree["subst"] = self.vocab.strings[tree["subst"]] |                 tree["subst"] = self.vocab.strings.add(tree["subst"]) | ||||||
| 
 | 
 | ||||||
|             trees.append(tree) |             trees.append(tree) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -272,7 +272,10 @@ class SpanCategorizer(TrainablePipe): | ||||||
|         DOCS: https://spacy.io/api/spancategorizer#predict |         DOCS: https://spacy.io/api/spancategorizer#predict | ||||||
|         """ |         """ | ||||||
|         indices = self.suggester(docs, ops=self.model.ops) |         indices = self.suggester(docs, ops=self.model.ops) | ||||||
|         scores = self.model.predict((docs, indices))  # type: ignore |         if indices.lengths.sum() == 0: | ||||||
|  |             scores = self.model.ops.alloc2f(0, 0) | ||||||
|  |         else: | ||||||
|  |             scores = self.model.predict((docs, indices))  # type: ignore | ||||||
|         return indices, scores |         return indices, scores | ||||||
| 
 | 
 | ||||||
|     def set_candidates( |     def set_candidates( | ||||||
|  |  | ||||||
|  | @ -87,7 +87,6 @@ subword_features = true | ||||||
|         "cats_macro_f": None, |         "cats_macro_f": None, | ||||||
|         "cats_macro_auc": None, |         "cats_macro_auc": None, | ||||||
|         "cats_f_per_type": None, |         "cats_f_per_type": None, | ||||||
|         "cats_macro_auc_per_type": None, |  | ||||||
|     }, |     }, | ||||||
| ) | ) | ||||||
| def make_textcat( | def make_textcat( | ||||||
|  |  | ||||||
|  | @ -87,7 +87,6 @@ subword_features = true | ||||||
|         "cats_macro_f": None, |         "cats_macro_f": None, | ||||||
|         "cats_macro_auc": None, |         "cats_macro_auc": None, | ||||||
|         "cats_f_per_type": None, |         "cats_f_per_type": None, | ||||||
|         "cats_macro_auc_per_type": None, |  | ||||||
|     }, |     }, | ||||||
| ) | ) | ||||||
| def make_multilabel_textcat( | def make_multilabel_textcat( | ||||||
|  |  | ||||||
|  | @ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab): | ||||||
| 
 | 
 | ||||||
|     # head before start |     # head before start | ||||||
|     arr = doc.to_array(["HEAD"]) |     arr = doc.to_array(["HEAD"]) | ||||||
|     arr[0] = -1 |     arr[0] = numpy.int32(-1).astype(numpy.uint64) | ||||||
|     doc_from_array = Doc(en_vocab, words=words) |     doc_from_array = Doc(en_vocab, words=words) | ||||||
|     with pytest.raises(ValueError): |     with pytest.raises(ValueError): | ||||||
|         doc_from_array.from_array(["HEAD"], arr) |         doc_from_array.from_array(["HEAD"], arr) | ||||||
| 
 | 
 | ||||||
|     # head after end |     # head after end | ||||||
|     arr = doc.to_array(["HEAD"]) |     arr = doc.to_array(["HEAD"]) | ||||||
|     arr[0] = 5 |     arr[0] = numpy.int32(5).astype(numpy.uint64) | ||||||
|     doc_from_array = Doc(en_vocab, words=words) |     doc_from_array = Doc(en_vocab, words=words) | ||||||
|     with pytest.raises(ValueError): |     with pytest.raises(ValueError): | ||||||
|         doc_from_array.from_array(["HEAD"], arr) |         doc_from_array.from_array(["HEAD"], arr) | ||||||
|  |  | ||||||
|  | @ -60,10 +60,45 @@ def test_initialize_from_labels(): | ||||||
|     nlp2 = Language() |     nlp2 = Language() | ||||||
|     lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer") |     lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer") | ||||||
|     lemmatizer2.initialize( |     lemmatizer2.initialize( | ||||||
|         get_examples=lambda: train_examples, |         # We want to check that the strings in replacement nodes are | ||||||
|  |         # added to the string store. Avoid that they get added through | ||||||
|  |         # the examples. | ||||||
|  |         get_examples=lambda: train_examples[:1], | ||||||
|         labels=lemmatizer.label_data, |         labels=lemmatizer.label_data, | ||||||
|     ) |     ) | ||||||
|     assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3} |     assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3} | ||||||
|  |     assert lemmatizer2.label_data == { | ||||||
|  |         "trees": [ | ||||||
|  |             {"orig": "S", "subst": "s"}, | ||||||
|  |             { | ||||||
|  |                 "prefix_len": 1, | ||||||
|  |                 "suffix_len": 0, | ||||||
|  |                 "prefix_tree": 0, | ||||||
|  |                 "suffix_tree": 4294967295, | ||||||
|  |             }, | ||||||
|  |             {"orig": "s", "subst": ""}, | ||||||
|  |             { | ||||||
|  |                 "prefix_len": 0, | ||||||
|  |                 "suffix_len": 1, | ||||||
|  |                 "prefix_tree": 4294967295, | ||||||
|  |                 "suffix_tree": 2, | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "prefix_len": 0, | ||||||
|  |                 "suffix_len": 0, | ||||||
|  |                 "prefix_tree": 4294967295, | ||||||
|  |                 "suffix_tree": 4294967295, | ||||||
|  |             }, | ||||||
|  |             {"orig": "E", "subst": "e"}, | ||||||
|  |             { | ||||||
|  |                 "prefix_len": 1, | ||||||
|  |                 "suffix_len": 0, | ||||||
|  |                 "prefix_tree": 5, | ||||||
|  |                 "suffix_tree": 4294967295, | ||||||
|  |             }, | ||||||
|  |         ], | ||||||
|  |         "labels": (1, 3, 4, 6), | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_no_data(): | def test_no_data(): | ||||||
|  |  | ||||||
|  | @ -372,24 +372,39 @@ def test_overfitting_IO_overlapping(): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_zero_suggestions(): | def test_zero_suggestions(): | ||||||
|     # Test with a suggester that returns 0 suggestions |     # Test with a suggester that can return 0 suggestions | ||||||
| 
 | 
 | ||||||
|     @registry.misc("test_zero_suggester") |     @registry.misc("test_mixed_zero_suggester") | ||||||
|     def make_zero_suggester(): |     def make_mixed_zero_suggester(): | ||||||
|         def zero_suggester(docs, *, ops=None): |         def mixed_zero_suggester(docs, *, ops=None): | ||||||
|             if ops is None: |             if ops is None: | ||||||
|                 ops = get_current_ops() |                 ops = get_current_ops() | ||||||
|             return Ragged( |             spans = [] | ||||||
|                 ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i") |             lengths = [] | ||||||
|             ) |             for doc in docs: | ||||||
|  |                 if len(doc) > 0 and len(doc) % 2 == 0: | ||||||
|  |                     spans.append((0, 1)) | ||||||
|  |                     lengths.append(1) | ||||||
|  |                 else: | ||||||
|  |                     lengths.append(0) | ||||||
|  |             spans = ops.asarray2i(spans) | ||||||
|  |             lengths_array = ops.asarray1i(lengths) | ||||||
|  |             if len(spans) > 0: | ||||||
|  |                 output = Ragged(ops.xp.vstack(spans), lengths_array) | ||||||
|  |             else: | ||||||
|  |                 output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array) | ||||||
|  |             return output | ||||||
| 
 | 
 | ||||||
|         return zero_suggester |         return mixed_zero_suggester | ||||||
| 
 | 
 | ||||||
|     fix_random_seed(0) |     fix_random_seed(0) | ||||||
|     nlp = English() |     nlp = English() | ||||||
|     spancat = nlp.add_pipe( |     spancat = nlp.add_pipe( | ||||||
|         "spancat", |         "spancat", | ||||||
|         config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY}, |         config={ | ||||||
|  |             "suggester": {"@misc": "test_mixed_zero_suggester"}, | ||||||
|  |             "spans_key": SPAN_KEY, | ||||||
|  |         }, | ||||||
|     ) |     ) | ||||||
|     train_examples = make_examples(nlp) |     train_examples = make_examples(nlp) | ||||||
|     optimizer = nlp.initialize(get_examples=lambda: train_examples) |     optimizer = nlp.initialize(get_examples=lambda: train_examples) | ||||||
|  | @ -397,6 +412,16 @@ def test_zero_suggestions(): | ||||||
|     assert set(spancat.labels) == {"LOC", "PERSON"} |     assert set(spancat.labels) == {"LOC", "PERSON"} | ||||||
| 
 | 
 | ||||||
|     nlp.update(train_examples, sgd=optimizer) |     nlp.update(train_examples, sgd=optimizer) | ||||||
|  |     # empty doc | ||||||
|  |     nlp("") | ||||||
|  |     # single doc with zero suggestions | ||||||
|  |     nlp("one") | ||||||
|  |     # single doc with one suggestion | ||||||
|  |     nlp("two two") | ||||||
|  |     # batch with mixed zero/one suggestions | ||||||
|  |     list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"])) | ||||||
|  |     # batch with no suggestions | ||||||
|  |     list(nlp.pipe(["", "one", "three three three"])) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_set_candidates(): | def test_set_candidates(): | ||||||
|  |  | ||||||
|  | @ -838,8 +838,8 @@ def test_textcat_loss(multi_label: bool, expected_loss: float): | ||||||
|         textcat = nlp.add_pipe("textcat_multilabel") |         textcat = nlp.add_pipe("textcat_multilabel") | ||||||
|     else: |     else: | ||||||
|         textcat = nlp.add_pipe("textcat") |         textcat = nlp.add_pipe("textcat") | ||||||
|     textcat.initialize(lambda: train_examples) |  | ||||||
|     assert isinstance(textcat, TextCategorizer) |     assert isinstance(textcat, TextCategorizer) | ||||||
|  |     textcat.initialize(lambda: train_examples) | ||||||
|     scores = textcat.model.ops.asarray( |     scores = textcat.model.ops.asarray( | ||||||
|         [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]], dtype="f"  # type: ignore |         [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0]], dtype="f"  # type: ignore | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|  | @ -3,6 +3,7 @@ import math | ||||||
| from collections import Counter | from collections import Counter | ||||||
| from typing import Tuple, List, Dict, Any | from typing import Tuple, List, Dict, Any | ||||||
| import pkg_resources | import pkg_resources | ||||||
|  | import time | ||||||
| 
 | 
 | ||||||
| import numpy | import numpy | ||||||
| import pytest | import pytest | ||||||
|  | @ -28,6 +29,7 @@ from spacy.cli.download import get_compatibility, get_version | ||||||
| from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config | from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config | ||||||
| from spacy.cli.package import get_third_party_dependencies | from spacy.cli.package import get_third_party_dependencies | ||||||
| from spacy.cli.package import _is_permitted_package_name | from spacy.cli.package import _is_permitted_package_name | ||||||
|  | from spacy.cli.project.remote_storage import RemoteStorage | ||||||
| from spacy.cli.project.run import _check_requirements | from spacy.cli.project.run import _check_requirements | ||||||
| from spacy.cli.validate import get_model_pkgs | from spacy.cli.validate import get_model_pkgs | ||||||
| from spacy.cli.find_threshold import find_threshold | from spacy.cli.find_threshold import find_threshold | ||||||
|  | @ -121,6 +123,25 @@ def test_issue7055(): | ||||||
|     assert "model" in filled_cfg["components"]["ner"] |     assert "model" in filled_cfg["components"]["ner"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.issue(11235) | ||||||
|  | def test_issue11235(): | ||||||
|  |     """ | ||||||
|  |     Test that the cli handles interpolation in the directory names correctly when loading project config. | ||||||
|  |     """ | ||||||
|  |     lang_var = "en" | ||||||
|  |     variables = {"lang": lang_var} | ||||||
|  |     commands = [{"name": "x", "script": ["hello ${vars.lang}"]}] | ||||||
|  |     directories = ["cfg", "${vars.lang}_model"] | ||||||
|  |     project = {"commands": commands, "vars": variables, "directories": directories} | ||||||
|  |     with make_tempdir() as d: | ||||||
|  |         srsly.write_yaml(d / "project.yml", project) | ||||||
|  |         cfg = load_project_config(d) | ||||||
|  |         # Check that the directories are interpolated and created correctly | ||||||
|  |         assert os.path.exists(d / "cfg") | ||||||
|  |         assert os.path.exists(d / f"{lang_var}_model") | ||||||
|  |     assert cfg["commands"][0]["script"][0] == f"hello {lang_var}" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_cli_info(): | def test_cli_info(): | ||||||
|     nlp = Dutch() |     nlp = Dutch() | ||||||
|     nlp.add_pipe("textcat") |     nlp.add_pipe("textcat") | ||||||
|  | @ -594,6 +615,7 @@ def test_string_to_list_intify(value): | ||||||
|     assert string_to_list(value, intify=True) == [1, 2, 3] |     assert string_to_list(value, intify=True) == [1, 2, 3] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.skip(reason="Temporarily skip for dev version") | ||||||
| def test_download_compatibility(): | def test_download_compatibility(): | ||||||
|     spec = SpecifierSet("==" + about.__version__) |     spec = SpecifierSet("==" + about.__version__) | ||||||
|     spec.prereleases = False |     spec.prereleases = False | ||||||
|  | @ -604,6 +626,7 @@ def test_download_compatibility(): | ||||||
|         assert get_minor_version(about.__version__) == get_minor_version(version) |         assert get_minor_version(about.__version__) == get_minor_version(version) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.skip(reason="Temporarily skip for dev version") | ||||||
| def test_validate_compatibility_table(): | def test_validate_compatibility_table(): | ||||||
|     spec = SpecifierSet("==" + about.__version__) |     spec = SpecifierSet("==" + about.__version__) | ||||||
|     spec.prereleases = False |     spec.prereleases = False | ||||||
|  | @ -862,6 +885,60 @@ def test_span_length_freq_dist_output_must_be_correct(): | ||||||
|     assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] |     assert list(span_freqs.keys()) == [3, 1, 4, 5, 2] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_local_remote_storage(): | ||||||
|  |     with make_tempdir() as d: | ||||||
|  |         filename = "a.txt" | ||||||
|  | 
 | ||||||
|  |         content_hashes = ("aaaa", "cccc", "bbbb") | ||||||
|  |         for i, content_hash in enumerate(content_hashes): | ||||||
|  |             # make sure that each subsequent file has a later timestamp | ||||||
|  |             if i > 0: | ||||||
|  |                 time.sleep(1) | ||||||
|  |             content = f"{content_hash} content" | ||||||
|  |             loc_file = d / "root" / filename | ||||||
|  |             if not loc_file.parent.exists(): | ||||||
|  |                 loc_file.parent.mkdir(parents=True) | ||||||
|  |             with loc_file.open(mode="w") as file_: | ||||||
|  |                 file_.write(content) | ||||||
|  | 
 | ||||||
|  |             # push first version to remote storage | ||||||
|  |             remote = RemoteStorage(d / "root", str(d / "remote")) | ||||||
|  |             remote.push(filename, "aaaa", content_hash) | ||||||
|  | 
 | ||||||
|  |             # retrieve with full hashes | ||||||
|  |             loc_file.unlink() | ||||||
|  |             remote.pull(filename, command_hash="aaaa", content_hash=content_hash) | ||||||
|  |             with loc_file.open(mode="r") as file_: | ||||||
|  |                 assert file_.read() == content | ||||||
|  | 
 | ||||||
|  |             # retrieve with command hash | ||||||
|  |             loc_file.unlink() | ||||||
|  |             remote.pull(filename, command_hash="aaaa") | ||||||
|  |             with loc_file.open(mode="r") as file_: | ||||||
|  |                 assert file_.read() == content | ||||||
|  | 
 | ||||||
|  |             # retrieve with content hash | ||||||
|  |             loc_file.unlink() | ||||||
|  |             remote.pull(filename, content_hash=content_hash) | ||||||
|  |             with loc_file.open(mode="r") as file_: | ||||||
|  |                 assert file_.read() == content | ||||||
|  | 
 | ||||||
|  |             # retrieve with no hashes | ||||||
|  |             loc_file.unlink() | ||||||
|  |             remote.pull(filename) | ||||||
|  |             with loc_file.open(mode="r") as file_: | ||||||
|  |                 assert file_.read() == content | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_local_remote_storage_pull_missing(): | ||||||
|  |     # pulling from a non-existent remote pulls nothing gracefully | ||||||
|  |     with make_tempdir() as d: | ||||||
|  |         filename = "a.txt" | ||||||
|  |         remote = RemoteStorage(d / "root", str(d / "remote")) | ||||||
|  |         assert remote.pull(filename, command_hash="aaaa") is None | ||||||
|  |         assert remote.pull(filename) is None | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_cli_find_threshold(capsys): | def test_cli_find_threshold(capsys): | ||||||
|     thresholds = numpy.linspace(0, 1, 10) |     thresholds = numpy.linspace(0, 1, 10) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -359,6 +359,7 @@ cdef class Doc: | ||||||
|             for annot in annotations: |             for annot in annotations: | ||||||
|                 if annot: |                 if annot: | ||||||
|                     if annot is heads or annot is sent_starts or annot is ent_iobs: |                     if annot is heads or annot is sent_starts or annot is ent_iobs: | ||||||
|  |                         annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64) | ||||||
|                         for i in range(len(words)): |                         for i in range(len(words)): | ||||||
|                             if attrs.ndim == 1: |                             if attrs.ndim == 1: | ||||||
|                                 attrs[i] = annot[i] |                                 attrs[i] = annot[i] | ||||||
|  | @ -1558,6 +1559,7 @@ cdef class Doc: | ||||||
| 
 | 
 | ||||||
|             for j, (attr, annot) in enumerate(token_annotations.items()): |             for j, (attr, annot) in enumerate(token_annotations.items()): | ||||||
|                 if attr is HEAD: |                 if attr is HEAD: | ||||||
|  |                     annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64) | ||||||
|                     for i in range(len(words)): |                     for i in range(len(words)): | ||||||
|                         array[i, j] = annot[i] |                         array[i, j] = annot[i] | ||||||
|                 elif attr is MORPH: |                 elif attr is MORPH: | ||||||
|  |  | ||||||
|  | @ -299,7 +299,7 @@ cdef class Span: | ||||||
|                     for ancestor in ancestors: |                     for ancestor in ancestors: | ||||||
|                         ancestor_i = ancestor.i - self.c.start |                         ancestor_i = ancestor.i - self.c.start | ||||||
|                         if ancestor_i in range(length): |                         if ancestor_i in range(length): | ||||||
|                             array[i, head_col] = ancestor_i - i |                             array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64) | ||||||
| 
 | 
 | ||||||
|                 # if there is no appropriate ancestor, define a new artificial root |                 # if there is no appropriate ancestor, define a new artificial root | ||||||
|                 value = array[i, head_col] |                 value = array[i, head_col] | ||||||
|  | @ -307,7 +307,7 @@ cdef class Span: | ||||||
|                     new_root = old_to_new_root.get(ancestor_i, None) |                     new_root = old_to_new_root.get(ancestor_i, None) | ||||||
|                     if new_root is not None: |                     if new_root is not None: | ||||||
|                         # take the same artificial root as a previous token from the same sentence |                         # take the same artificial root as a previous token from the same sentence | ||||||
|                         array[i, head_col] = new_root - i |                         array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64) | ||||||
|                     else: |                     else: | ||||||
|                         # set this token as the new artificial root |                         # set this token as the new artificial root | ||||||
|                         array[i, head_col] = 0 |                         array[i, head_col] = 0 | ||||||
|  |  | ||||||
|  | @ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot): | ||||||
|         if key not in IDS: |         if key not in IDS: | ||||||
|             raise ValueError(Errors.E974.format(obj="token", key=key)) |             raise ValueError(Errors.E974.format(obj="token", key=key)) | ||||||
|         elif key in ["ORTH", "SPACY"]: |         elif key in ["ORTH", "SPACY"]: | ||||||
|             pass |             continue | ||||||
|         elif key == "HEAD": |         elif key == "HEAD": | ||||||
|             attrs.append(key) |             attrs.append(key) | ||||||
|             values.append([h-i if h is not None else 0 for i, h in enumerate(value)]) |             row = [h-i if h is not None else 0 for i, h in enumerate(value)] | ||||||
|         elif key == "DEP": |         elif key == "DEP": | ||||||
|             attrs.append(key) |             attrs.append(key) | ||||||
|             values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]) |             row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value] | ||||||
|         elif key == "SENT_START": |         elif key == "SENT_START": | ||||||
|             attrs.append(key) |             attrs.append(key) | ||||||
|             values.append([to_ternary_int(v) for v in value]) |             row = [to_ternary_int(v) for v in value] | ||||||
|         elif key == "MORPH": |         elif key == "MORPH": | ||||||
|             attrs.append(key) |             attrs.append(key) | ||||||
|             values.append([vocab.morphology.add(v) for v in value]) |             row = [vocab.morphology.add(v) for v in value] | ||||||
|         else: |         else: | ||||||
|             attrs.append(key) |             attrs.append(key) | ||||||
|             if not all(isinstance(v, str) for v in value): |             if not all(isinstance(v, str) for v in value): | ||||||
|                 types = set([type(v) for v in value]) |                 types = set([type(v) for v in value]) | ||||||
|                 raise TypeError(Errors.E969.format(field=key, types=types)) from None |                 raise TypeError(Errors.E969.format(field=key, types=types)) from None | ||||||
|             values.append([vocab.strings.add(v) for v in value]) |             row = [vocab.strings.add(v) for v in value] | ||||||
|     array = numpy.asarray(values, dtype="uint64") |         values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row]) | ||||||
|  |     array = numpy.array(values, dtype=numpy.uint64) | ||||||
|     return attrs, array.T |     return attrs, array.T | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -51,8 +51,7 @@ from . import about | ||||||
| 
 | 
 | ||||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||||
|     # This lets us add type hints for mypy etc. without causing circular imports |     # This lets us add type hints for mypy etc. without causing circular imports | ||||||
|     from .language import Language  # noqa: F401 |     from .language import Language, PipeCallable  # noqa: F401 | ||||||
|     from .pipeline import Pipe  # noqa: F401 |  | ||||||
|     from .tokens import Doc, Span  # noqa: F401 |     from .tokens import Doc, Span  # noqa: F401 | ||||||
|     from .vocab import Vocab  # noqa: F401 |     from .vocab import Vocab  # noqa: F401 | ||||||
| 
 | 
 | ||||||
|  | @ -1642,9 +1641,11 @@ def check_bool_env_var(env_var: str) -> bool: | ||||||
| 
 | 
 | ||||||
| def _pipe( | def _pipe( | ||||||
|     docs: Iterable["Doc"], |     docs: Iterable["Doc"], | ||||||
|     proc: "Pipe", |     proc: "PipeCallable", | ||||||
|     name: str, |     name: str, | ||||||
|     default_error_handler: Callable[[str, "Pipe", List["Doc"], Exception], NoReturn], |     default_error_handler: Callable[ | ||||||
|  |         [str, "PipeCallable", List["Doc"], Exception], NoReturn | ||||||
|  |     ], | ||||||
|     kwargs: Mapping[str, Any], |     kwargs: Mapping[str, Any], | ||||||
| ) -> Iterator["Doc"]: | ) -> Iterator["Doc"]: | ||||||
|     if hasattr(proc, "pipe"): |     if hasattr(proc, "pipe"): | ||||||
|  |  | ||||||
|  | @ -1391,12 +1391,13 @@ If the contents are different, the new version of the file is uploaded. Deleting | ||||||
| obsolete files is left up to you. | obsolete files is left up to you. | ||||||
| 
 | 
 | ||||||
| Remotes can be defined in the `remotes` section of the | Remotes can be defined in the `remotes` section of the | ||||||
| [`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the | [`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses | ||||||
| [`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to | [`Pathy`](https://github.com/justindujardin/pathy) to communicate with the | ||||||
| communicate with the remote storages, so you can use any protocol that | remote storages, so you can use any protocol that `Pathy` supports, including | ||||||
| `smart-open` supports, including [S3](https://aws.amazon.com/s3/), | [S3](https://aws.amazon.com/s3/), | ||||||
| [Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although | [Google Cloud Storage](https://cloud.google.com/storage), and the local | ||||||
| you may need to install extra dependencies to use certain protocols. | filesystem, although you may need to install extra dependencies to use certain | ||||||
|  | protocols. | ||||||
| 
 | 
 | ||||||
| ```cli | ```cli | ||||||
| $ python -m spacy project push [remote] [project_dir] | $ python -m spacy project push [remote] [project_dir] | ||||||
|  | @ -1435,12 +1436,13 @@ outputs, so if you change the config back, you'll be able to fetch back the | ||||||
| result. | result. | ||||||
| 
 | 
 | ||||||
| Remotes can be defined in the `remotes` section of the | Remotes can be defined in the `remotes` section of the | ||||||
| [`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the | [`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses | ||||||
| [`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to | [`Pathy`](https://github.com/justindujardin/pathy) to communicate with the | ||||||
| communicate with the remote storages, so you can use any protocol that | remote storages, so you can use any protocol that `Pathy` supports, including | ||||||
| `smart-open` supports, including [S3](https://aws.amazon.com/s3/), | [S3](https://aws.amazon.com/s3/), | ||||||
| [Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although | [Google Cloud Storage](https://cloud.google.com/storage), and the local | ||||||
| you may need to install extra dependencies to use certain protocols. | filesystem, although you may need to install extra dependencies to use certain | ||||||
|  | protocols. | ||||||
| 
 | 
 | ||||||
| ```cli | ```cli | ||||||
| $ python -m spacy project pull [remote] [project_dir] | $ python -m spacy project pull [remote] [project_dir] | ||||||
|  |  | ||||||
|  | @ -1004,6 +1004,54 @@ This method was previously available as `spacy.gold.spans_from_biluo_tags`. | ||||||
| | `tags`      | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | | `tags`      | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | ||||||
| | **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~                                                                                                                                                                                        | | | **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~                                                                                                                                                                                        | | ||||||
| 
 | 
 | ||||||
|  | ### training.biluo_to_iob {#biluo_to_iob tag="function"} | ||||||
|  | 
 | ||||||
|  | Convert a sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags to | ||||||
|  | [IOB](/usage/linguistic-features#accessing-ner) tags. This is useful if you want | ||||||
|  | to use the BILUO tags with a model that only supports IOB tags. | ||||||
|  | 
 | ||||||
|  | > #### Example | ||||||
|  | > | ||||||
|  | > ```python | ||||||
|  | > from spacy.training import biluo_to_iob | ||||||
|  | > | ||||||
|  | > tags = ["O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] | ||||||
|  | > iob_tags = biluo_to_iob(tags) | ||||||
|  | > assert iob_tags == ["O", "O", "B-LOC", "I-LOC", "I-LOC", "O"] | ||||||
|  | > ``` | ||||||
|  | 
 | ||||||
|  | | Name        | Description                                                                             | | ||||||
|  | | ----------- | --------------------------------------------------------------------------------------- | | ||||||
|  | | `tags`      | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ | | ||||||
|  | | **RETURNS** | A list of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~           | | ||||||
|  | 
 | ||||||
|  | ### training.iob_to_biluo {#iob_to_biluo tag="function"} | ||||||
|  | 
 | ||||||
|  | Convert a sequence of [IOB](/usage/linguistic-features#accessing-ner) tags to | ||||||
|  | [BILUO](/usage/linguistic-features#accessing-ner) tags. This is useful if you | ||||||
|  | want to use the IOB tags with a model that only supports BILUO tags. | ||||||
|  | 
 | ||||||
|  | <Infobox title="Changed in v3.0" variant="warning" id="iob_to_biluo"> | ||||||
|  | 
 | ||||||
|  | This method was previously available as `spacy.gold.iob_to_biluo`. | ||||||
|  | 
 | ||||||
|  | </Infobox> | ||||||
|  | 
 | ||||||
|  | > #### Example | ||||||
|  | > | ||||||
|  | > ```python | ||||||
|  | > from spacy.training import iob_to_biluo | ||||||
|  | > | ||||||
|  | > tags = ["O", "O", "B-LOC", "I-LOC", "O"] | ||||||
|  | > biluo_tags = iob_to_biluo(tags) | ||||||
|  | > assert biluo_tags == ["O", "O", "B-LOC", "L-LOC", "O"] | ||||||
|  | > ``` | ||||||
|  | 
 | ||||||
|  | | Name        | Description                                                                           | | ||||||
|  | | ----------- | ------------------------------------------------------------------------------------- | | ||||||
|  | | `tags`      | A sequence of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ | | ||||||
|  | | **RETURNS** | A list of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~       | | ||||||
|  | 
 | ||||||
| ## Utility functions {#util source="spacy/util.py"} | ## Utility functions {#util source="spacy/util.py"} | ||||||
| 
 | 
 | ||||||
| spaCy comes with a small collection of utility functions located in | spaCy comes with a small collection of utility functions located in | ||||||
|  |  | ||||||
|  | @ -308,14 +308,14 @@ Load state from a binary string. | ||||||
| > assert type(PERSON) == int | > assert type(PERSON) == int | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| | Name                                           | Description                                                                                                                                                            | | | Name                                           | Description                                                                                                                                                             | | ||||||
| | ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | ---------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `strings`                                      | A table managing the string-to-int mapping. ~~StringStore~~                                                                                                            | | | `strings`                                      | A table managing the string-to-int mapping. ~~StringStore~~                                                                                                             | | ||||||
| | `vectors`                                      | A table associating word IDs to word vectors. ~~Vectors~~                                                                                                              | | | `vectors`                                      | A table associating word IDs to word vectors. ~~Vectors~~                                                                                                               | | ||||||
| | `vectors_length`                               | Number of dimensions for each word vector. ~~int~~                                                                                                                     | | | `vectors_length`                               | Number of dimensions for each word vector. ~~int~~                                                                                                                      | | ||||||
| | `lookups`                                      | The available lookup tables in this vocab. ~~Lookups~~                                                                                                                 | | | `lookups`                                      | The available lookup tables in this vocab. ~~Lookups~~                                                                                                                  | | ||||||
| | `writing_system`                               | A dict with information about the language's writing system. ~~Dict[str, Any]~~                                                                                        | | | `writing_system`                               | A dict with information about the language's writing system. ~~Dict[str, Any]~~                                                                                         | | ||||||
| | `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ | | | `get_noun_chunks` <Tag variant="new">3.0</Tag> | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ | | ||||||
| 
 | 
 | ||||||
| ## Serialization fields {#serialization-fields} | ## Serialization fields {#serialization-fields} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -259,9 +259,9 @@ pipelines. | ||||||
| > This can be used in a project command like so: | > This can be used in a project command like so: | ||||||
| > | > | ||||||
| > ```yaml | > ```yaml | ||||||
| >   - name: "echo-path" | > - name: 'echo-path' | ||||||
| >     script: | >   script: | ||||||
| >       - "echo ${env.ENV_PATH}" | >     - 'echo ${env.ENV_PATH}' | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| | Section                                             | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | | | Section                                             | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  | | ||||||
|  | @ -643,12 +643,13 @@ locally. | ||||||
| 
 | 
 | ||||||
| You can list one or more remotes in the `remotes` section of your | You can list one or more remotes in the `remotes` section of your | ||||||
| [`project.yml`](#project-yml) by mapping a string name to the URL of the | [`project.yml`](#project-yml) by mapping a string name to the URL of the | ||||||
| storage. Under the hood, spaCy uses the | storage. Under the hood, spaCy uses | ||||||
| [`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to | [`Pathy`](https://github.com/justindujardin/pathy) to communicate with the | ||||||
| communicate with the remote storages, so you can use any protocol that | remote storages, so you can use any protocol that `Pathy` supports, including | ||||||
| `smart-open` supports, including [S3](https://aws.amazon.com/s3/), | [S3](https://aws.amazon.com/s3/), | ||||||
| [Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although | [Google Cloud Storage](https://cloud.google.com/storage), and the local | ||||||
| you may need to install extra dependencies to use certain protocols. | filesystem, although you may need to install extra dependencies to use certain | ||||||
|  | protocols. | ||||||
| 
 | 
 | ||||||
| > #### Example | > #### Example | ||||||
| > | > | ||||||
|  | @ -661,7 +662,6 @@ you may need to install extra dependencies to use certain protocols. | ||||||
| remotes: | remotes: | ||||||
|   default: 's3://my-spacy-bucket' |   default: 's3://my-spacy-bucket' | ||||||
|   local: '/mnt/scratch/cache' |   local: '/mnt/scratch/cache' | ||||||
|   stuff: 'ssh://myserver.example.com/whatever' |  | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| <Infobox title="How it works" emoji="💡"> | <Infobox title="How it works" emoji="💡"> | ||||||
|  |  | ||||||
|  | @ -66,8 +66,8 @@ The English CNN pipelines have new word vectors: | ||||||
| | Package                                         | Model Version |  TAG | Parser LAS | NER F | | | Package                                         | Model Version |  TAG | Parser LAS | NER F | | ||||||
| | ----------------------------------------------- | ------------- | ---: | ---------: | ----: | | | ----------------------------------------------- | ------------- | ---: | ---------: | ----: | | ||||||
| | [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0        | 97.3 |       90.1 |  84.6 | | | [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0        | 97.3 |       90.1 |  84.6 | | ||||||
| | [`en_core_web_md`](/models/en#en_core_web_lg) | v3.4.0        | 97.2 |       90.3 |  85.5 | | | [`en_core_web_md`](/models/en#en_core_web_md) | v3.4.0        | 97.2 |       90.3 |  85.5 | | ||||||
| | [`en_core_web_lg`](/models/en#en_core_web_md) | v3.3.0        | 97.4 |       90.1 |  85.3 | | | [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.3.0        | 97.4 |       90.1 |  85.3 | | ||||||
| | [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0        | 97.3 |       90.2 |  85.6 | | | [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0        | 97.3 |       90.2 |  85.6 | | ||||||
| 
 | 
 | ||||||
| ## Notes about upgrading from v3.3 {#upgrading} | ## Notes about upgrading from v3.3 {#upgrading} | ||||||
|  |  | ||||||
|  | @ -45,7 +45,7 @@ | ||||||
|                     { "text": "v2.x Documentation", "url": "https://v2.spacy.io" }, |                     { "text": "v2.x Documentation", "url": "https://v2.spacy.io" }, | ||||||
|                     { |                     { | ||||||
|                         "text": "Custom Solutions", |                         "text": "Custom Solutions", | ||||||
|                         "url": "https://explosion.ai/spacy-tailored-pipelines" |                         "url": "https://explosion.ai/custom-solutions" | ||||||
|                     } |                     } | ||||||
|                 ] |                 ] | ||||||
|             } |             } | ||||||
|  |  | ||||||
|  | @ -51,7 +51,7 @@ | ||||||
|                 { "text": "Online Course", "url": "https://course.spacy.io" }, |                 { "text": "Online Course", "url": "https://course.spacy.io" }, | ||||||
|                 { |                 { | ||||||
|                     "text": "Custom Solutions", |                     "text": "Custom Solutions", | ||||||
|                     "url": "https://explosion.ai/spacy-tailored-pipelines" |                     "url": "https://explosion.ai/custom-solutions" | ||||||
|                 } |                 } | ||||||
|             ] |             ] | ||||||
|         }, |         }, | ||||||
|  |  | ||||||
|  | @ -1023,25 +1023,6 @@ | ||||||
|             }, |             }, | ||||||
|             "category": ["pipeline"] |             "category": ["pipeline"] | ||||||
|         }, |         }, | ||||||
|         { |  | ||||||
|             "id": "spacy-sentence-segmenter", |  | ||||||
|             "title": "Sentence Segmenter", |  | ||||||
|             "slogan": "Custom sentence segmentation for spaCy", |  | ||||||
|             "code_example": [ |  | ||||||
|                 "from seg.newline.segmenter import NewLineSegmenter", |  | ||||||
|                 "import spacy", |  | ||||||
|                 "", |  | ||||||
|                 "nlseg = NewLineSegmenter()", |  | ||||||
|                 "nlp = spacy.load('en')", |  | ||||||
|                 "nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')", |  | ||||||
|                 "doc = nlp(my_doc_text)" |  | ||||||
|             ], |  | ||||||
|             "author": "tc64", |  | ||||||
|             "author_links": { |  | ||||||
|                 "github": "tc64" |  | ||||||
|             }, |  | ||||||
|             "category": ["pipeline"] |  | ||||||
|         }, |  | ||||||
|         { |         { | ||||||
|             "id": "spacy_cld", |             "id": "spacy_cld", | ||||||
|             "title": "spaCy-CLD", |             "title": "spaCy-CLD", | ||||||
|  | @ -1468,13 +1449,26 @@ | ||||||
|             "image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png", |             "image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png", | ||||||
|             "code_example": [ |             "code_example": [ | ||||||
|                 "import spacy", |                 "import spacy", | ||||||
|                 "import scattertext as st", |  | ||||||
|                 "", |                 "", | ||||||
|                 "nlp = spacy.load('en')", |                 "from scattertext import SampleCorpora, produce_scattertext_explorer", | ||||||
|                 "corpus = st.CorpusFromPandas(convention_df,", |                 "from scattertext import produce_scattertext_html", | ||||||
|                 "                             category_col='party',", |                 "from scattertext.CorpusFromPandas import CorpusFromPandas", | ||||||
|                 "                             text_col='text',", |                 "", | ||||||
|                 "                             nlp=nlp).build()" |                 "nlp = spacy.load('en_core_web_sm')", | ||||||
|  |                 "convention_df = SampleCorpora.ConventionData2012.get_data()", | ||||||
|  |                 "corpus = CorpusFromPandas(convention_df,", | ||||||
|  |                 "                          category_col='party',", | ||||||
|  |                 "                          text_col='text',", | ||||||
|  |                 "                          nlp=nlp).build()", | ||||||
|  |                 "", | ||||||
|  |                 "html = produce_scattertext_html(corpus,", | ||||||
|  |                 "                                    category='democrat',", | ||||||
|  |                 "                                    category_name='Democratic',", | ||||||
|  |                 "                                    not_category_name='Republican',", | ||||||
|  |                 "                                    minimum_term_frequency=5,", | ||||||
|  |                 "                                    width_in_pixels=1000)", | ||||||
|  |                 "open('./simple.html', 'wb').write(html.encode('utf-8'))", | ||||||
|  |                 "print('Open ./simple.html in Chrome or Firefox.')" | ||||||
|             ], |             ], | ||||||
|             "author": "Jason Kessler", |             "author": "Jason Kessler", | ||||||
|             "author_links": { |             "author_links": { | ||||||
|  |  | ||||||
|  | @ -105,13 +105,13 @@ const Landing = ({ data }) => { | ||||||
| 
 | 
 | ||||||
|             <LandingBannerGrid> |             <LandingBannerGrid> | ||||||
|                 <LandingBanner |                 <LandingBanner | ||||||
|                     to="https://explosion.ai/spacy-tailored-pipelines" |                     to="https://explosion.ai/custom-solutions" | ||||||
|                     button="Learn more" |                     button="Learn more" | ||||||
|                     background="#E4F4F9" |                     background="#E4F4F9" | ||||||
|                     color="#1e1935" |                     color="#1e1935" | ||||||
|                     small |                     small | ||||||
|                 > |                 > | ||||||
|                     <Link to="https://explosion.ai/spacy-tailored-pipelines" hidden> |                     <Link to="https://explosion.ai/custom-solutions" hidden> | ||||||
|                         <img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" /> |                         <img src={tailoredPipelinesImage} alt="spaCy Tailored Pipelines" /> | ||||||
|                     </Link> |                     </Link> | ||||||
|                     <strong> |                     <strong> | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user