mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 21:51:24 +03:00 
			
		
		
		
	Merge remote-tracking branch 'upstream/master' into maintenance/v4-merge-master-20240119
This commit is contained in:
		
						commit
						81beaea70e
					
				
							
								
								
									
										1
									
								
								.github/FUNDING.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.github/FUNDING.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1 @@ | |||
| custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions] | ||||
							
								
								
									
										4
									
								
								.github/workflows/tests.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/workflows/tests.yml
									
									
									
									
										vendored
									
									
								
							|  | @ -58,7 +58,7 @@ jobs: | |||
|       fail-fast: true | ||||
|       matrix: | ||||
|         os: [ubuntu-latest, windows-latest, macos-latest] | ||||
|         python_version: ["3.11"] | ||||
|         python_version: ["3.12"] | ||||
|         include: | ||||
|           - os: macos-latest | ||||
|             python_version: "3.8" | ||||
|  | @ -66,6 +66,8 @@ jobs: | |||
|             python_version: "3.9" | ||||
|           - os: windows-latest | ||||
|             python_version: "3.10" | ||||
|           - os: macos-latest | ||||
|             python_version: "3.11" | ||||
| 
 | ||||
|     runs-on: ${{ matrix.os }} | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										2
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								LICENSE
									
									
									
									
									
								
							|  | @ -1,6 +1,6 @@ | |||
| The MIT License (MIT) | ||||
| 
 | ||||
| Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal | ||||
| Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal | ||||
| 
 | ||||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||||
| of this software and associated documentation files (the "Software"), to deal | ||||
|  |  | |||
							
								
								
									
										79
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										79
									
								
								README.md
									
									
									
									
									
								
							|  | @ -6,23 +6,20 @@ spaCy is a library for **advanced Natural Language Processing** in Python and | |||
| Cython. It's built on the very latest research, and was designed from day one to | ||||
| be used in real products. | ||||
| 
 | ||||
| spaCy comes with | ||||
| [pretrained pipelines](https://spacy.io/models) and | ||||
| currently supports tokenization and training for **70+ languages**. It features | ||||
| state-of-the-art speed and **neural network models** for tagging, | ||||
| parsing, **named entity recognition**, **text classification** and more, | ||||
| multi-task learning with pretrained **transformers** like BERT, as well as a | ||||
| spaCy comes with [pretrained pipelines](https://spacy.io/models) and currently | ||||
| supports tokenization and training for **70+ languages**. It features | ||||
| state-of-the-art speed and **neural network models** for tagging, parsing, | ||||
| **named entity recognition**, **text classification** and more, multi-task | ||||
| learning with pretrained **transformers** like BERT, as well as a | ||||
| production-ready [**training system**](https://spacy.io/usage/training) and easy | ||||
| model packaging, deployment and workflow management. spaCy is commercial | ||||
| open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). | ||||
| open-source software, released under the | ||||
| [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). | ||||
| 
 | ||||
| 💥 **We'd love to hear more about your experience with spaCy!** | ||||
| [Fill out our survey here.](https://form.typeform.com/to/aMel9q9f) | ||||
| 
 | ||||
| 💫 **Version 3.5 out now!** | ||||
| 💫 **Version 3.7 out now!** | ||||
| [Check out the release notes here.](https://github.com/explosion/spaCy/releases) | ||||
| 
 | ||||
| [](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) | ||||
| [](https://github.com/explosion/spaCy/actions/workflows/tests.yml) | ||||
| [](https://github.com/explosion/spaCy/releases) | ||||
| [](https://pypi.org/project/spacy/) | ||||
| [](https://anaconda.org/conda-forge/spacy) | ||||
|  | @ -35,35 +32,42 @@ open-source software, released under the [MIT license](https://github.com/explos | |||
| 
 | ||||
| ## 📖 Documentation | ||||
| 
 | ||||
| | Documentation                 |                                                                        | | ||||
| | ----------------------------- | ---------------------------------------------------------------------- | | ||||
| | ⭐️ **[spaCy 101]**           | New to spaCy? Here's everything you need to know!                      | | ||||
| | 📚 **[Usage Guides]**         | How to use spaCy and its features.                                     | | ||||
| | 🚀 **[New in v3.0]**          | New features, backwards incompatibilities and migration guide.         | | ||||
| | 🪐 **[Project Templates]**    | End-to-end workflows you can clone, modify and run.                    | | ||||
| | 🎛 **[API Reference]**         | The detailed reference for spaCy's API.                                | | ||||
| | 📦 **[Models]**               | Download trained pipelines for spaCy.                                  | | ||||
| | 🌌 **[Universe]**             | Plugins, extensions, demos and books from the spaCy ecosystem.         | | ||||
| | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. | | ||||
| | 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. | | ||||
| | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | | ||||
| | 🛠 **[Changelog]** | Changes and version history. | | ||||
| | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. | | ||||
| | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** | | ||||
| | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** | | ||||
| | Documentation                                                                                                                                                                                                             |                                                                                                                                                                                                                                                                                                                                              | | ||||
| | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | ⭐️ **[spaCy 101]**                                                                                                                                                                                                       | New to spaCy? Here's everything you need to know!                                                                                                                                                                                                                                                                                            | | ||||
| | 📚 **[Usage Guides]**                                                                                                                                                                                                     | How to use spaCy and its features.                                                                                                                                                                                                                                                                                                           | | ||||
| | 🚀 **[New in v3.0]**                                                                                                                                                                                                      | New features, backwards incompatibilities and migration guide.                                                                                                                                                                                                                                                                               | | ||||
| | 🪐 **[Project Templates]**                                                                                                                                                                                                | End-to-end workflows you can clone, modify and run.                                                                                                                                                                                                                                                                                          | | ||||
| | 🎛 **[API Reference]**                                                                                                                                                                                                     | The detailed reference for spaCy's API.                                                                                                                                                                                                                                                                                                      | | ||||
| | ⏩ **[GPU Processing]**                                                                                                                                                                                                    | Use spaCy with CUDA-compatible GPU processing.                                                                                                                                                                                                                                                                                               | | ||||
| | 📦 **[Models]**                                                                                                                                                                                                           | Download trained pipelines for spaCy.                                                                                                                                                                                                                                                                                                        | | ||||
| | 🦙 **[Large Language Models]**                                                                                                                                                                                            | Integrate LLMs into spaCy pipelines.                                                                                                                                                                                                                                                                                                        | | ||||
| | 🌌 **[Universe]**                                                                                                                                                                                                         | Plugins, extensions, demos and books from the spaCy ecosystem.                                                                                                                                                                                                                                                                               | | ||||
| | ⚙️ **[spaCy VS Code Extension]**                                                                                                                                                                                          | Additional tooling and features for working with spaCy's config files.                                                                                                                                                                                                                                                                       | | ||||
| | 👩🏫 **[Online Course]**                                                                                                                                                                                                    | Learn spaCy in this free and interactive online course.                                                                                                                                                                                                                                                                                      | | ||||
| | 📰 **[Blog]**                                                                                                                                                                                                             | Read about current spaCy and Prodigy development, releases, talks and more from Explosion.                                                                                                                                                                                                                 | | ||||
| | 📺 **[Videos]**                                                                                                                                                                                                           | Our YouTube channel with video tutorials, talks and more.                                                                                                                                                                                                                                                                                    | | ||||
| | 🛠 **[Changelog]**                                                                                                                                                                                                         | Changes and version history.                                                                                                                                                                                                                                                                                                                 | | ||||
| | 💝 **[Contribute]**                                                                                                                                                                                                       | How to contribute to the spaCy project and code base.                                                                                                                                                                                                                                                                                        | | ||||
| | 👕 **[Swag]**                                                                                                                                                                                                             | Support us and our work with unique, custom-designed swag!                                                                                                                                                                                                                                                                                   | | ||||
| | <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy’s core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and we'll be in touch! **[Learn more →](https://explosion.ai/tailored-solutions)**                 | | ||||
| 
 | ||||
| [spacy 101]: https://spacy.io/usage/spacy-101 | ||||
| [new in v3.0]: https://spacy.io/usage/v3 | ||||
| [usage guides]: https://spacy.io/usage/ | ||||
| [api reference]: https://spacy.io/api/ | ||||
| [gpu processing]: https://spacy.io/usage#gpu | ||||
| [models]: https://spacy.io/models | ||||
| [large language models]: https://spacy.io/usage/large-language-models | ||||
| [universe]: https://spacy.io/universe | ||||
| [spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode | ||||
| [spacy vs code extension]: https://github.com/explosion/spacy-vscode | ||||
| [videos]: https://www.youtube.com/c/ExplosionAI | ||||
| [online course]: https://course.spacy.io | ||||
| [blog]: https://explosion.ai | ||||
| [project templates]: https://github.com/explosion/projects | ||||
| [changelog]: https://spacy.io/usage#changelog | ||||
| [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md | ||||
| [swag]: https://explosion.ai/merch | ||||
| 
 | ||||
| ## 💬 Where to ask questions | ||||
| 
 | ||||
|  | @ -92,7 +96,9 @@ more people can benefit from it. | |||
| - State-of-the-art speed | ||||
| - Production-ready **training system** | ||||
| - Linguistically-motivated **tokenization** | ||||
| - Components for named **entity recognition**, part-of-speech-tagging, dependency parsing, sentence segmentation, **text classification**, lemmatization, morphological analysis, entity linking and more | ||||
| - Components for named **entity recognition**, part-of-speech-tagging, | ||||
|   dependency parsing, sentence segmentation, **text classification**, | ||||
|   lemmatization, morphological analysis, entity linking and more | ||||
| - Easily extensible with **custom components** and attributes | ||||
| - Support for custom models in **PyTorch**, **TensorFlow** and other frameworks | ||||
| - Built in **visualizers** for syntax and NER | ||||
|  | @ -118,8 +124,8 @@ For detailed installation instructions, see the | |||
| ### pip | ||||
| 
 | ||||
| Using pip, spaCy releases are available as source packages and binary wheels. | ||||
| Before you install spaCy and its dependencies, make sure that | ||||
| your `pip`, `setuptools` and `wheel` are up to date. | ||||
| Before you install spaCy and its dependencies, make sure that your `pip`, | ||||
| `setuptools` and `wheel` are up to date. | ||||
| 
 | ||||
| ```bash | ||||
| pip install -U pip setuptools wheel | ||||
|  | @ -174,9 +180,9 @@ with the new version. | |||
| 
 | ||||
| ## 📦 Download model packages | ||||
| 
 | ||||
| Trained pipelines for spaCy can be installed as **Python packages**. This | ||||
| means that they're a component of your application, just like any other module. | ||||
| Models can be installed using spaCy's [`download`](https://spacy.io/api/cli#download) | ||||
| Trained pipelines for spaCy can be installed as **Python packages**. This means | ||||
| that they're a component of your application, just like any other module. Models | ||||
| can be installed using spaCy's [`download`](https://spacy.io/api/cli#download) | ||||
| command, or manually by pointing pip to a path or URL. | ||||
| 
 | ||||
| | Documentation              |                                                                  | | ||||
|  | @ -242,8 +248,7 @@ do that depends on your system. | |||
| | **Mac**     | Install a recent version of [XCode](https://developer.apple.com/xcode/), including the so-called "Command Line Tools". macOS and OS X ship with Python and git preinstalled.                                                                                        | | ||||
| | **Windows** | Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that matches the version that was used to compile your Python interpreter. | | ||||
| 
 | ||||
| For more details | ||||
| and instructions, see the documentation on | ||||
| For more details and instructions, see the documentation on | ||||
| [compiling spaCy from source](https://spacy.io/usage#source) and the | ||||
| [quickstart widget](https://spacy.io/usage#section-quickstart) to get the right | ||||
| commands for your platform and Python version. | ||||
|  |  | |||
|  | @ -1,7 +1,4 @@ | |||
| # build version constraints for use with wheelwright + multibuild | ||||
| # build version constraints for use with wheelwright | ||||
| numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64' | ||||
| numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' | ||||
| numpy==1.19.3; python_version=='3.9' | ||||
| numpy==1.21.3; python_version=='3.10' | ||||
| numpy==1.23.2; python_version=='3.11' | ||||
| numpy; python_version>='3.12' | ||||
| numpy>=1.25.0; python_version>='3.9' | ||||
|  |  | |||
|  | @ -1,14 +1,17 @@ | |||
| # Listeners | ||||
| 
 | ||||
| 1. [Overview](#1-overview) | ||||
| 2. [Initialization](#2-initialization) | ||||
|    - [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component) | ||||
|    - [B. Shape inference](#2b-shape-inference) | ||||
| 3. [Internal communication](#3-internal-communication) | ||||
|    - [A. During prediction](#3a-during-prediction) | ||||
|    - [B. During training](#3b-during-training) | ||||
|    - [C. Frozen components](#3c-frozen-components) | ||||
| 4. [Replacing listener with standalone](#4-replacing-listener-with-standalone) | ||||
| - [1. Overview](#1-overview) | ||||
| - [2. Initialization](#2-initialization) | ||||
|   - [2A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component) | ||||
|   - [2B. Shape inference](#2b-shape-inference) | ||||
| - [3. Internal communication](#3-internal-communication) | ||||
|   - [3A. During prediction](#3a-during-prediction) | ||||
|   - [3B. During training](#3b-during-training) | ||||
|     - [Training with multiple listeners](#training-with-multiple-listeners) | ||||
|   - [3C. Frozen components](#3c-frozen-components) | ||||
|     - [The Tok2Vec or Transformer is frozen](#the-tok2vec-or-transformer-is-frozen) | ||||
|     - [The upstream component is frozen](#the-upstream-component-is-frozen) | ||||
| - [4. Replacing listener with standalone](#4-replacing-listener-with-standalone) | ||||
| 
 | ||||
| ## 1. Overview | ||||
| 
 | ||||
|  | @ -62,7 +65,7 @@ of this `find_listener()` method will specifically identify sublayers of a model | |||
| 
 | ||||
| If it's a Transformer-based pipeline, a | ||||
| [`transformer` component](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py) | ||||
| has a similar implementation but its `find_listener()` function will specifically look for `TransformerListener`  | ||||
| has a similar implementation but its `find_listener()` function will specifically look for `TransformerListener` | ||||
| sublayers of downstream components. | ||||
| 
 | ||||
| ### 2B. Shape inference | ||||
|  | @ -154,7 +157,7 @@ as a tagger or a parser. This used to be impossible before 3.1, but has become s | |||
| embedding component in the [`annotating_components`](https://spacy.io/usage/training#annotating-components) | ||||
| list of the config. This works like any other "annotating component" because it relies on the `Doc` attributes. | ||||
| 
 | ||||
| However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related  | ||||
| However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related | ||||
| listener isn't frozen, then a `W086` warning is shown and further training of the pipeline will likely end with `E954`. | ||||
| 
 | ||||
| #### The upstream component is frozen | ||||
|  | @ -216,5 +219,17 @@ new_model = tok2vec_model.attrs["replace_listener"](new_model) | |||
| ``` | ||||
| 
 | ||||
| The new config and model are then properly stored on the `nlp` object. | ||||
| Note that this functionality (running the replacement for a transformer listener) was broken prior to  | ||||
| Note that this functionality (running the replacement for a transformer listener) was broken prior to | ||||
| `spacy-transformers` 1.0.5. | ||||
| 
 | ||||
| In spaCy 3.7, `Language.replace_listeners` was updated to pass the following additional arguments to the `replace_listener` callback: | ||||
| the listener to be replaced and the `tok2vec`/`transformer` pipe from which the new model was copied. To maintain backwards-compatibility, | ||||
| the method only passes these extra arguments for callbacks that support them: | ||||
| 
 | ||||
| ``` | ||||
| def replace_listener_pre_37(copied_tok2vec_model): | ||||
|   ... | ||||
| 
 | ||||
| def replace_listener_post_37(copied_tok2vec_model, replaced_listener, tok2vec_pipe): | ||||
|   ... | ||||
| ``` | ||||
|  |  | |||
|  | @ -158,3 +158,45 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
| SOFTWARE. | ||||
| 
 | ||||
| 
 | ||||
| SciPy | ||||
| ----- | ||||
| 
 | ||||
| * Files: scorer.py | ||||
| 
 | ||||
| The implementation of trapezoid() is adapted from SciPy, which is distributed | ||||
| under the following license: | ||||
| 
 | ||||
| New BSD License | ||||
| 
 | ||||
| Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers. | ||||
| All rights reserved. | ||||
| 
 | ||||
| Redistribution and use in source and binary forms, with or without | ||||
| modification, are permitted provided that the following conditions | ||||
| are met: | ||||
| 
 | ||||
| 1. Redistributions of source code must retain the above copyright | ||||
|    notice, this list of conditions and the following disclaimer. | ||||
| 
 | ||||
| 2. Redistributions in binary form must reproduce the above | ||||
|    copyright notice, this list of conditions and the following | ||||
|    disclaimer in the documentation and/or other materials provided | ||||
|    with the distribution. | ||||
| 
 | ||||
| 3. Neither the name of the copyright holder nor the names of its | ||||
|    contributors may be used to endorse or promote products derived | ||||
|    from this software without specific prior written permission. | ||||
| 
 | ||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||||
| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||||
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||||
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||||
| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||||
| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||||
| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||||
| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||||
| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||||
| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||||
| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||||
|  |  | |||
|  | @ -6,7 +6,8 @@ requires = [ | |||
|     "preshed>=3.0.2,<3.1.0", | ||||
|     "murmurhash>=0.28.0,<1.1.0", | ||||
|     "thinc>=9.0.0.dev4,<9.1.0", | ||||
|     "numpy>=1.15.0", | ||||
|     "numpy>=1.15.0; python_version < '3.9'", | ||||
|     "numpy>=1.25.0; python_version >= '3.9'", | ||||
| ] | ||||
| build-backend = "setuptools.build_meta" | ||||
| 
 | ||||
|  |  | |||
|  | @ -10,13 +10,14 @@ wasabi>=0.9.1,<1.2.0 | |||
| srsly>=2.4.3,<3.0.0 | ||||
| catalogue>=2.0.6,<2.1.0 | ||||
| typer>=0.3.0,<0.10.0 | ||||
| pathy>=0.10.0 | ||||
| smart-open>=5.2.1,<7.0.0 | ||||
| weasel>=0.1.0,<0.4.0 | ||||
| # Third party dependencies | ||||
| numpy>=1.15.0 | ||||
| numpy>=1.15.0; python_version < "3.9" | ||||
| numpy>=1.19.0; python_version >= "3.9" | ||||
| requests>=2.13.0,<3.0.0 | ||||
| tqdm>=4.38.0,<5.0.0 | ||||
| pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 | ||||
| pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 | ||||
| jinja2 | ||||
| langcodes>=3.2.0,<4.0.0 | ||||
| # Official Python utilities | ||||
|  | @ -36,5 +37,5 @@ types-setuptools>=57.0.0 | |||
| types-requests | ||||
| types-setuptools>=57.0.0 | ||||
| black==22.3.0 | ||||
| cython-lint>=0.15.0; python_version >= "3.7" | ||||
| cython-lint>=0.15.0 | ||||
| isort>=5.0,<6.0 | ||||
|  |  | |||
							
								
								
									
										18
									
								
								setup.cfg
									
									
									
									
									
								
							
							
						
						
									
										18
									
								
								setup.cfg
									
									
									
									
									
								
							|  | @ -30,9 +30,12 @@ project_urls = | |||
| zip_safe = false | ||||
| include_package_data = true | ||||
| python_requires = >=3.8 | ||||
| # NOTE: This section is superseded by pyproject.toml and will be removed in | ||||
| # spaCy v4 | ||||
| setup_requires = | ||||
|     cython>=0.25,<3.0 | ||||
|     numpy>=1.15.0 | ||||
|     numpy>=1.15.0; python_version < "3.9" | ||||
|     numpy>=1.19.0; python_version >= "3.9" | ||||
|     # We also need our Cython packages here to compile against | ||||
|     cymem>=2.0.2,<2.1.0 | ||||
|     preshed>=3.0.2,<3.1.0 | ||||
|  | @ -49,14 +52,15 @@ install_requires = | |||
|     wasabi>=0.9.1,<1.2.0 | ||||
|     srsly>=2.4.3,<3.0.0 | ||||
|     catalogue>=2.0.6,<2.1.0 | ||||
|     weasel>=0.1.0,<0.4.0 | ||||
|     # Third-party dependencies | ||||
|     typer>=0.3.0,<0.10.0 | ||||
|     pathy>=0.10.0 | ||||
|     smart-open>=5.2.1,<7.0.0 | ||||
|     tqdm>=4.38.0,<5.0.0 | ||||
|     numpy>=1.15.0 | ||||
|     numpy>=1.15.0; python_version < "3.9" | ||||
|     numpy>=1.19.0; python_version >= "3.9" | ||||
|     requests>=2.13.0,<3.0.0 | ||||
|     pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 | ||||
|     pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0 | ||||
|     jinja2 | ||||
|     # Official Python utilities | ||||
|     setuptools | ||||
|  | @ -71,9 +75,7 @@ console_scripts = | |||
| lookups = | ||||
|     spacy_lookups_data>=1.0.3,<1.1.0 | ||||
| transformers = | ||||
|     spacy_transformers>=1.1.2,<1.3.0 | ||||
| ray = | ||||
|     spacy_ray>=0.1.0,<1.0.0 | ||||
|     spacy_transformers>=1.1.2,<1.4.0 | ||||
| cuda = | ||||
|     cupy>=5.0.0b4,<13.0.0 | ||||
| cuda80 = | ||||
|  | @ -108,6 +110,8 @@ cuda117 = | |||
|     cupy-cuda117>=5.0.0b4,<13.0.0 | ||||
| cuda11x = | ||||
|     cupy-cuda11x>=11.0.0,<13.0.0 | ||||
| cuda12x = | ||||
|     cupy-cuda12x>=11.5.0,<13.0.0 | ||||
| cuda-autodetect = | ||||
|     cupy-wheel>=11.0.0,<13.0.0 | ||||
| apple = | ||||
|  |  | |||
							
								
								
									
										32
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										32
									
								
								setup.py
									
									
									
									
									
								
							|  | @ -1,10 +1,9 @@ | |||
| #!/usr/bin/env python | ||||
| from setuptools import Extension, setup, find_packages | ||||
| import sys | ||||
| import platform | ||||
| import numpy | ||||
| from distutils.command.build_ext import build_ext | ||||
| from distutils.sysconfig import get_python_inc | ||||
| from setuptools.command.build_ext import build_ext | ||||
| from sysconfig import get_path | ||||
| from pathlib import Path | ||||
| import shutil | ||||
| from Cython.Build import cythonize | ||||
|  | @ -80,6 +79,7 @@ COMPILER_DIRECTIVES = { | |||
|     "language_level": -3, | ||||
|     "embedsignature": True, | ||||
|     "annotation_typing": False, | ||||
|     "profile": sys.version_info < (3, 12), | ||||
| } | ||||
| # Files to copy into the package that are otherwise not included | ||||
| COPY_FILES = { | ||||
|  | @ -89,30 +89,6 @@ COPY_FILES = { | |||
| } | ||||
| 
 | ||||
| 
 | ||||
def is_new_osx():
    """Check whether we're on OSX >= 10.7.

    RETURNS (bool): True only when running on darwin with a "10.x" version
        string whose minor component is at least 7. False on every other
        platform and for version strings that don't start with "10".
    """
    if sys.platform == "darwin":
        version = platform.mac_ver()[0]
        if version.startswith("10"):
            # The second dotted component is the 10.x minor release number.
            return int(version.split(".")[1]) >= 7
    return False
| 
 | ||||
| 
 | ||||
| if is_new_osx(): | ||||
|     # On Mac, use libc++ because Apple deprecated the use of | ||||
|     # libstdc++ | ||||
|     COMPILE_OPTIONS["other"].append("-stdlib=libc++") | ||||
|     LINK_OPTIONS["other"].append("-lc++") | ||||
|     # g++ (used by unix compiler on mac) links to libstdc++ as a default lib. | ||||
|     # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc | ||||
|     LINK_OPTIONS["other"].append("-nodefaultlibs") | ||||
| 
 | ||||
| 
 | ||||
| # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options | ||||
| # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used | ||||
| class build_ext_options: | ||||
|  | @ -205,7 +181,7 @@ def setup_package(): | |||
| 
 | ||||
|     include_dirs = [ | ||||
|         numpy.get_include(), | ||||
|         get_python_inc(plat_specific=True), | ||||
|         get_path("include"), | ||||
|     ] | ||||
|     ext_modules = [] | ||||
|     ext_modules.append( | ||||
|  |  | |||
|  | @ -1,3 +1,4 @@ | |||
| # cython: profile=False | ||||
| from .errors import Errors | ||||
| 
 | ||||
| IOB_STRINGS = ("", "I", "O", "B") | ||||
|  |  | |||
|  | @ -14,6 +14,7 @@ from .debug_diff import debug_diff  # noqa: F401 | |||
| from .debug_model import debug_model  # noqa: F401 | ||||
| from .download import download  # noqa: F401 | ||||
| from .evaluate import evaluate  # noqa: F401 | ||||
| from .find_function import find_function  # noqa: F401 | ||||
| from .find_threshold import find_threshold  # noqa: F401 | ||||
| from .info import info  # noqa: F401 | ||||
| from .init_config import fill_config, init_config  # noqa: F401 | ||||
|  | @ -21,15 +22,17 @@ from .init_pipeline import init_pipeline_cli  # noqa: F401 | |||
| from .package import package  # noqa: F401 | ||||
| from .pretrain import pretrain  # noqa: F401 | ||||
| from .profile import profile  # noqa: F401 | ||||
| from .project.assets import project_assets  # noqa: F401 | ||||
| from .project.clone import project_clone  # noqa: F401 | ||||
| from .project.document import project_document  # noqa: F401 | ||||
| from .project.dvc import project_update_dvc  # noqa: F401 | ||||
| from .project.pull import project_pull  # noqa: F401 | ||||
| from .project.push import project_push  # noqa: F401 | ||||
| from .project.run import project_run  # noqa: F401 | ||||
| from .train import train_cli  # noqa: F401 | ||||
| from .validate import validate  # noqa: F401 | ||||
| from .project.assets import project_assets  # type: ignore[attr-defined]  # noqa: F401 | ||||
| from .project.clone import project_clone  # type: ignore[attr-defined]  # noqa: F401 | ||||
| from .project.document import (  # type: ignore[attr-defined]  # noqa: F401 | ||||
|     project_document, | ||||
| ) | ||||
| from .project.dvc import project_update_dvc  # type: ignore[attr-defined]  # noqa: F401 | ||||
| from .project.pull import project_pull  # type: ignore[attr-defined]  # noqa: F401 | ||||
| from .project.push import project_push  # type: ignore[attr-defined]  # noqa: F401 | ||||
| from .project.run import project_run  # type: ignore[attr-defined]  # noqa: F401 | ||||
| from .train import train_cli  # type: ignore[attr-defined]  # noqa: F401 | ||||
| from .validate import validate  # type: ignore[attr-defined]  # noqa: F401 | ||||
| 
 | ||||
| 
 | ||||
| @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) | ||||
|  |  | |||
|  | @ -26,10 +26,11 @@ from thinc.api import Config, ConfigValidationError, require_gpu | |||
| from thinc.util import gpu_is_available | ||||
| from typer.main import get_command | ||||
| from wasabi import Printer, msg | ||||
| from weasel import app as project_cli | ||||
| 
 | ||||
| from .. import about | ||||
| from ..errors import RENAMED_LANGUAGE_CODES | ||||
| from ..schemas import ProjectConfigSchema, validate | ||||
| from ..schemas import validate | ||||
| from ..util import ( | ||||
|     ENV_VARS, | ||||
|     SimpleFrozenDict, | ||||
|  | @ -41,15 +42,10 @@ from ..util import ( | |||
|     run_command, | ||||
| ) | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from pathy import FluidPath  # noqa: F401 | ||||
| 
 | ||||
| 
 | ||||
| SDIST_SUFFIX = ".tar.gz" | ||||
| WHEEL_SUFFIX = "-py3-none-any.whl" | ||||
| 
 | ||||
| PROJECT_FILE = "project.yml" | ||||
| PROJECT_LOCK = "project.lock" | ||||
| COMMAND = "python -m spacy" | ||||
| NAME = "spacy" | ||||
| HELP = """spaCy Command-line Interface | ||||
|  | @ -75,11 +71,10 @@ Opt = typer.Option | |||
| 
 | ||||
| app = typer.Typer(name=NAME, help=HELP) | ||||
| benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True) | ||||
| project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True) | ||||
| debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True) | ||||
| init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True) | ||||
| 
 | ||||
| app.add_typer(project_cli) | ||||
| app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True) | ||||
| app.add_typer(debug_cli) | ||||
| app.add_typer(benchmark_cli) | ||||
| app.add_typer(init_cli) | ||||
|  | @ -164,148 +159,6 @@ def _handle_renamed_language_codes(lang: Optional[str]) -> None: | |||
|         ) | ||||
| 
 | ||||
| 
 | ||||
def load_project_config(
    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
) -> Dict[str, Any]:
    """Read and validate the project.yml found in a project directory, then
    create any directories the config lists that don't exist yet.

    path (Path): The path to the project directory.
    interpolate (bool): Whether to substitute project variables.
    overrides (Dict[str, Any]): Optional config overrides.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    project_file = path / PROJECT_FILE
    if not project_file.exists():
        msg.fail(f"Can't find {PROJECT_FILE}", project_file, exits=1)
    yaml_hint = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        loaded = srsly.read_yaml(project_file)
    except ValueError as exc:
        msg.fail(yaml_hint, exc, exits=1)
    schema_errors = validate(ProjectConfigSchema, loaded)
    if schema_errors:
        msg.fail(yaml_hint)
        print("\n".join(schema_errors))
        sys.exit(1)
    validate_project_version(loaded)
    validate_project_commands(loaded)
    if interpolate:
        title = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=title, hint_fill=False):
            loaded = substitute_project_variables(loaded, overrides)
    # Create the configured directories one at a time: an earlier entry may
    # already have created the parent of a later one.
    for name in loaded.get("directories", []):
        target = path / name
        if target.exists():
            continue
        target.mkdir(parents=True)
    return loaded
| 
 | ||||
| 
 | ||||
def substitute_project_variables(
    config: Dict[str, Any],
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    key: str = "vars",
    env_key: str = "env",
) -> Dict[str, Any]:
    """Resolve variable references in the project config via the config system.

    The passed-in config is modified in place: missing `key` / `env_key`
    sections are created and the environment mapping is replaced by the
    parsed values read from the environment.

    config (Dict[str, Any]): The project config.
    overrides (Dict[str, Any]): Optional config overrides.
    key (str): Key containing variables in project config.
    env_key (str): Key containing environment variable mapping in project config.
    RETURNS (Dict[str, Any]): The interpolated project config.
    """
    variables = config.setdefault(key, {})
    env_mapping = config.setdefault(env_key, {})
    # Replace each environment variable name with its (parsed) value; unset
    # variables are treated as the empty string.
    for name, env_name in env_mapping.items():
        env_mapping[name] = _parse_override(os.environ.get(env_name, ""))
    # The variables have to be repeated at the top level so that the whole
    # project can live in its own "project" section. Otherwise a list of
    # commands in the top scope wouldn't be allowed by Thinc's config system.
    wrapped = Config({"project": config, key: variables, env_key: env_mapping})
    reparsed = Config().from_str(wrapped.to_str(), overrides=overrides)
    return dict(reparsed.interpolate()["project"])
| 
 | ||||
| 
 | ||||
def validate_project_version(config: Dict[str, Any]) -> None:
    """If the project defines a compatible spaCy version range, check that
    it's compatible with the current version of spaCy, and fail otherwise.

    config (Dict[str, Any]): The loaded config.
    """
    required = config.get("spacy_version", None)
    if not required:
        # No version constraint declared, so there is nothing to check.
        return
    if is_compatible_version(about.__version__, required):
        return
    msg.fail(
        f"The {PROJECT_FILE} specifies a spaCy version range ({required}) "
        f"that's not compatible with the version of spaCy you're running "
        f"({about.__version__}). You can edit version requirement in the "
        f"{PROJECT_FILE} to load it, but the project may not run as expected.",
        exits=1,
    )
| 
 | ||||
| 
 | ||||
def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist.

    Any violation is reported via `msg.fail(..., exits=1)`.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    workflows = config.get("workflows", {})
    # Find duplicates in a single pass instead of calling list.count() for
    # every command name.
    seen = set()
    duplicates = set()
    for name in command_names:
        if name in seen:
            duplicates.add(name)
        else:
            seen.add(name)
    if duplicates:
        # Sort so the error message is the same on every run.
        listed = ", ".join(sorted(duplicates))
        err = f"Duplicate commands defined in {PROJECT_FILE}: {listed}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in workflows.items():
        if workflow_name in command_names:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        for step in workflow_steps:
            if step not in command_names:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )
| 
 | ||||
| 
 | ||||
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
    """Compute the MD5 hex digest of a JSON-serializable object.

    The object is serialized with sorted keys before hashing.

    data: The data to hash.
    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
    RETURNS (str): The hash.
    """
    payload = data
    if isinstance(payload, dict):
        payload = {
            name: value for name, value in payload.items() if name not in exclude
        }
    serialized = srsly.json_dumps(payload, sort_keys=True)
    digest = hashlib.md5(serialized.encode("utf8"))
    return digest.hexdigest()
| 
 | ||||
| 
 | ||||
def get_checksum(path: Union[Path, str]) -> str:
    """Compute the MD5 checksum of a file, or of all files in a directory.

    For a directory, the contents of every file found recursively are fed
    into a single digest in sorted path order.

    path (Union[Path, str]): The file or directory path.
    RETURNS (str): The checksum.
    """
    target = Path(path)
    if not target.is_file() and not target.is_dir():
        msg.fail(f"Can't get checksum for {target}: not a file or directory", exits=1)
    if target.is_file():
        return hashlib.md5(target.read_bytes()).hexdigest()
    # TODO: this is currently pretty slow
    digest = hashlib.md5()
    members = [entry for entry in target.rglob("*") if entry.is_file()]
    members.sort()
    for member in members:
        digest.update(member.read_bytes())
    return digest.hexdigest()
| 
 | ||||
| 
 | ||||
| @contextmanager | ||||
| def show_validation_error( | ||||
|     file_path: Optional[Union[str, Path]] = None, | ||||
|  | @ -370,166 +223,10 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None: | |||
|             msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) | ||||
| 
 | ||||
| 
 | ||||
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
    """Upload a file using smart_open.

    src (Path): The source path.
    dest (Union[str, FluidPath]): The destination path or URL to upload to.
    """
    import smart_open

    # Create parent directories for local paths
    if isinstance(dest, Path):
        if not dest.parent.exists():
            dest.parent.mkdir(parents=True)

    dest = str(dest)
    with smart_open.open(dest, mode="wb") as output_file:
        with src.open(mode="rb") as input_file:
            # Stream the contents instead of reading the whole file into
            # memory at once, matching what download_file does.
            shutil.copyfileobj(input_file, output_file)
| 
 | ||||
| 
 | ||||
def download_file(
    src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
) -> None:
    """Download a file using smart_open.

    src (Union[str, FluidPath]): The URL or path of the file to download.
    dest (Path): The destination path.
    force (bool): Whether to force download even if file exists.
        If False, the download will be skipped.
    """
    if dest.exists() and not force:
        return None

    # Only import smart_open once we know a download is actually needed.
    import smart_open

    src = str(src)
    # Compression handling is disabled so the file is copied byte-for-byte.
    with smart_open.open(src, mode="rb", compression="disable") as input_file:
        with dest.open(mode="wb") as output_file:
            shutil.copyfileobj(input_file, output_file)
| 
 | ||||
| 
 | ||||
def ensure_pathy(path):
    """Temporary helper that converts a path with `Pathy.fluid`.

    Pathy is imported inside the function to prevent importing it globally
    (which can cause slow and annoying Google Cloud warning).
    """
    from pathy import Pathy as _Pathy  # noqa: F811

    fluid_path = _Pathy.fluid(path)
    return fluid_path
| 
 | ||||
| 
 | ||||
def git_checkout(
    repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
):
    """Clone a Git repository and copy one of its subpaths to a destination.

    repo (str): The repository to clone.
    subpath (str): The path within the repository to copy.
    dest (Path): The destination. Must not exist yet, but its parent must.
    branch (str): The branch to clone.
    sparse (bool): Whether to try a sparse checkout. Only used if the
        installed Git is v2.22 or above; otherwise a warning is shown and
        the full repository is cloned.
    """
    version = get_git_version()
    if dest.exists():
        msg.fail("Destination of checkout must not exist", exits=1)
    if not dest.parent.exists():
        msg.fail("Parent of destination of checkout must exist", exits=1)
    if sparse:
        if version >= (2, 22):
            return git_sparse_checkout(repo, subpath, dest, branch)
        # Only show warnings if the user explicitly wants sparse checkout but
        # the Git version doesn't support it
        if version == (0, 0):
            reason = "You're running an unknown version of Git, so sparse checkout has been disabled."
        else:
            reason = (
                f"You're running an old version of Git (v{version[0]}.{version[1]}) "
                f"that doesn't fully support sparse checkout yet."
            )
        msg.warn(
            f"{reason} "
            f"This means that more files than necessary may be downloaded "
            f"temporarily. To only download the files needed, make sure "
            f"you're using Git v2.22 or above."
        )
    with make_tempdir() as tmp_dir:
        run_command(f"git -C {tmp_dir} clone {repo} . -b {branch}", capture=True)
        try:
            # We need Path(subpath) to make sure we also support subdirectories
            wanted = tmp_dir / Path(subpath)
            if not is_subpath_of(tmp_dir, wanted):
                msg.fail(
                    f"'{subpath}' is a path outside of the cloned repository.",
                    repo,
                    exits=1,
                )
            shutil.copytree(str(wanted), str(dest))
        except FileNotFoundError:
            msg.fail(
                f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')",
                repo,
                exits=1,
            )
| 
 | ||||
| 
 | ||||
def git_sparse_checkout(repo, subpath, dest, branch):
    """Fetch only the files under `subpath` from a repo and move them to `dest`.

    repo: The repository to clone.
    subpath: The path within the repository to fetch.
    dest: The destination the fetched subpath is moved to.
    branch: The branch to clone and check out.
    """
    # This combines a partial clone with a sparse checkout so that only the
    # files we need are transferred. A plain "sparse checkout" still clones
    # the whole repo, and checking out a path from a blob-less clone works but
    # is very slow, because every missing object is transferred one by one.
    # So the steps are: clone without downloading any blobs, ask Git which
    # objects are missing for the subpath, fetch those in bulk, and only then
    # check out the subpath.
    with make_tempdir() as tmp_dir:
        # Step 1: "clone, but don't download anything".
        clone_cmd = (
            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
            f"-b {branch} --filter=blob:none"
        )
        run_command(clone_cmd)
        # Step 2: list the objects that are missing for the subpath we want.
        listing = run_command(
            f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}",
            capture=True,
        )
        git_repo = _http_to_git(repo)
        # Missing objects are printed with a leading "?", which is stripped.
        missings = " ".join(
            token[1:] for token in listing.stdout.split() if token.startswith("?")
        )
        if not missings:
            msg.fail(
                f"Could not find any relevant files for '{subpath}'. "
                f"Did you specify a correct and complete path within repo '{repo}' "
                f"and branch {branch}?",
                exits=1,
            )
        # Step 3: fetch all the missing objects in bulk.
        run_command(
            f"git -C {tmp_dir} fetch-pack {git_repo} {missings}", capture=True
        )
        # Step 4: finally, check out the subpath.
        run_command(f"git -C {tmp_dir} checkout {branch} {subpath}", capture=True)

        # Get a subdirectory of the cloned path, if appropriate
        wanted = tmp_dir / Path(subpath)
        if not is_subpath_of(tmp_dir, wanted):
            msg.fail(
                f"'{subpath}' is a path outside of the cloned repository.",
                repo,
                exits=1,
            )

        shutil.move(str(wanted), str(dest))
| 
 | ||||
| 
 | ||||
def git_repo_branch_exists(repo: str, branch: str) -> bool:
    """Check via 'git ls-remote' whether a branch exists on a repository.

    repo (str): URL to get repo.
    branch (str): Branch on repo to check.
    RETURNS (bool): True if repo:branch exists.
    """
    # Called only to make sure git can be run; the version itself is unused.
    get_git_version()
    # `--exit-code` isn't used here because `run_command` already handles the
    # return code. Instead we rely on stdout being '' when the requested
    # branch doesn't exist.
    listing = run_command(f"git ls-remote {repo} {branch}", capture=True)
    return listing.stdout != ""
| 
 | ||||
| 
 | ||||
| def get_git_version( | ||||
|     error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", | ||||
| ) -> Tuple[int, int]: | ||||
|     """Get the version of git and raise an error if calling 'git --version' fails. | ||||
| 
 | ||||
|     error (str): The error message to show. | ||||
|     RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns | ||||
|         (0, 0) if the version couldn't be determined. | ||||
|  | @ -545,30 +242,6 @@ def get_git_version( | |||
|     return int(version[0]), int(version[1]) | ||||
| 
 | ||||
| 
 | ||||
def _http_to_git(repo: str) -> str:
    """Convert an http(s) repository URL to its `git@host:path.git` form.

    Only a leading "http://" or "https://" scheme is rewritten. One trailing
    slash is dropped, and ".git" is appended only if it isn't already there.
    Anything that doesn't start with one of those schemes is returned as-is.

    repo (str): The repository URL.
    RETURNS (str): The converted repository address.
    """
    for scheme in ("http://", "https://"):
        if repo.startswith(scheme):
            remainder = repo[len(scheme) :]
            break
    else:
        return repo
    # The first slash separates the host from the repository path.
    converted = "git@" + remainder.replace("/", ":", 1)
    if converted.endswith("/"):
        converted = converted[:-1]
    if not converted.endswith(".git"):
        converted = f"{converted}.git"
    return converted
| 
 | ||||
| 
 | ||||
def is_subpath_of(parent, child):
    """
    Check whether `child` is a path contained within `parent`.

    Both paths are resolved with `os.path.realpath` before comparing.
    """
    # Based on https://stackoverflow.com/a/37095733 .

    # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
    # we can stop using crusty old os.path functions.
    resolved_parent, resolved_child = (
        os.path.realpath(location) for location in (parent, child)
    )
    shared = os.path.commonpath([resolved_parent, resolved_child])
    return shared == resolved_parent
| 
 | ||||
| 
 | ||||
| @overload | ||||
| def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]: | ||||
|     ... | ||||
|  |  | |||
|  | @ -133,7 +133,9 @@ def apply( | |||
|     if len(text_files) > 0: | ||||
|         streams.append(_stream_texts(text_files)) | ||||
|     datagen = cast(DocOrStrStream, chain(*streams)) | ||||
|     for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)): | ||||
|     for doc in tqdm.tqdm( | ||||
|         nlp.pipe(datagen, batch_size=batch_size, n_process=n_process), disable=None | ||||
|     ): | ||||
|         docbin.add(doc) | ||||
|     if output_file.suffix == "": | ||||
|         output_file = output_file.with_suffix(".spacy") | ||||
|  |  | |||
|  | @ -40,7 +40,8 @@ def assemble_cli( | |||
| 
 | ||||
|     DOCS: https://spacy.io/api/cli#assemble | ||||
|     """ | ||||
|     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) | ||||
|     if verbose: | ||||
|         util.logger.setLevel(logging.DEBUG) | ||||
|     # Make sure all files and paths exists if they are needed | ||||
|     if not config_path or (str(config_path) != "-" and not config_path.exists()): | ||||
|         msg.fail("Config file not found", config_path, exits=1) | ||||
|  |  | |||
|  | @ -89,7 +89,7 @@ class Quartiles: | |||
| def annotate( | ||||
|     nlp: Language, docs: List[Doc], batch_size: Optional[int] | ||||
| ) -> numpy.ndarray: | ||||
|     docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size) | ||||
|     docs = nlp.pipe(tqdm(docs, unit="doc", disable=None), batch_size=batch_size) | ||||
|     wps = [] | ||||
|     while True: | ||||
|         with time_context() as elapsed: | ||||
|  |  | |||
|  | @ -10,6 +10,8 @@ from ..util import ( | |||
|     get_installed_models, | ||||
|     get_minor_version, | ||||
|     get_package_version, | ||||
|     is_in_interactive, | ||||
|     is_in_jupyter, | ||||
|     is_package, | ||||
|     is_prerelease_version, | ||||
|     run_command, | ||||
|  | @ -85,6 +87,27 @@ def download( | |||
|         "Download and installation successful", | ||||
|         f"You can now load the package via spacy.load('{model_name}')", | ||||
|     ) | ||||
|     if is_in_jupyter(): | ||||
|         reload_deps_msg = ( | ||||
|             "If you are in a Jupyter or Colab notebook, you may need to " | ||||
|             "restart Python in order to load all the package's dependencies. " | ||||
|             "You can do this by selecting the 'Restart kernel' or 'Restart " | ||||
|             "runtime' option." | ||||
|         ) | ||||
|         msg.warn( | ||||
|             "Restart to reload dependencies", | ||||
|             reload_deps_msg, | ||||
|         ) | ||||
|     elif is_in_interactive(): | ||||
|         reload_deps_msg = ( | ||||
|             "If you are in an interactive Python session, you may need to " | ||||
|             "exit and restart Python to load all the package's dependencies. " | ||||
|             "You can exit with Ctrl-D (or Ctrl-Z and Enter on Windows)." | ||||
|         ) | ||||
|         msg.warn( | ||||
|             "Restart to reload dependencies", | ||||
|             reload_deps_msg, | ||||
|         ) | ||||
| 
 | ||||
| 
 | ||||
| def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str: | ||||
|  |  | |||
|  | @ -28,6 +28,7 @@ def evaluate_cli( | |||
|     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), | ||||
|     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), | ||||
|     per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."), | ||||
|     spans_key: str = Opt("sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans"), | ||||
|     # fmt: on | ||||
| ): | ||||
|     """ | ||||
|  | @ -53,6 +54,7 @@ def evaluate_cli( | |||
|         displacy_limit=displacy_limit, | ||||
|         per_component=per_component, | ||||
|         silent=False, | ||||
|         spans_key=spans_key, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										69
									
								
								spacy/cli/find_function.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								spacy/cli/find_function.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,69 @@ | |||
| from typing import Optional, Tuple | ||||
| 
 | ||||
| from catalogue import RegistryError | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ..util import registry | ||||
| from ._util import Arg, Opt, app | ||||
| 
 | ||||
| 
 | ||||
@app.command("find-function")
def find_function_cli(
    # fmt: off
    func_name: str = Arg(..., help="Name of the registered function."),
    registry_name: Optional[str] = Opt(None, "--registry", "-r", help="Name of the catalogue registry."),
    # fmt: on
):
    """
    Find the module, path and line number to the file the registered
    function is defined in, if available.

    func_name (str): Name of the registered function.
    registry_name (Optional[str]): Name of the catalogue registry.

    DOCS: https://spacy.io/api/cli#find-function
    """
    if not registry_name:
        # No registry given: use the first registry that has the function.
        candidates = (
            name
            for name in registry.get_registry_names()
            if registry.has(name, func_name)
        )
        registry_name = next(candidates, registry_name)

    if not registry_name:
        msg.fail(
            f"Couldn't find registered function: '{func_name}'",
            exits=1,
        )

    assert registry_name is not None
    find_function(func_name, registry_name)
| 
 | ||||
| 
 | ||||
def find_function(func_name: str, registry_name: str) -> Tuple[str, int]:
    """Look up where a registered function is defined and print the location.

    func_name (str): Name of the registered function.
    registry_name (str): Name of the registry to look the function up in.
    RETURNS (Tuple[str, int]): The file path and line number of the function.
    """
    description = None
    try:
        description = registry.find(registry_name, func_name)
    except RegistryError as err:
        msg.fail(
            f"Couldn't find registered function: '{func_name}' in registry '{registry_name}'",
        )
        msg.fail(f"{err}", exits=1)
    assert description is not None

    # The line number is only read when a file is recorded for the function.
    source_file, source_line = None, None
    file_entry = description["file"]
    if file_entry:
        source_file, source_line = file_entry, description["line_no"]

    if not (source_file and source_line):
        msg.fail(
            f"Couldn't find path to registered function: '{func_name}' in registry '{registry_name}'",
            exits=1,
        )
    assert source_file is not None
    assert source_line is not None

    msg.good(f"Found registered function '{func_name}' at {source_file}:{source_line}")
    return str(source_file), int(source_line)
|  | @ -52,8 +52,8 @@ def find_threshold_cli( | |||
| 
 | ||||
|     DOCS: https://spacy.io/api/cli#find-threshold | ||||
|     """ | ||||
| 
 | ||||
|     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) | ||||
|     if verbose: | ||||
|         util.logger.setLevel(logging.DEBUG) | ||||
|     import_code(code_path) | ||||
|     find_threshold( | ||||
|         model=model, | ||||
|  |  | |||
|  | @ -90,7 +90,8 @@ def init_pipeline_cli( | |||
|     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU") | ||||
|     # fmt: on | ||||
| ): | ||||
|     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) | ||||
|     if verbose: | ||||
|         util.logger.setLevel(logging.DEBUG) | ||||
|     overrides = parse_config_overrides(ctx.args) | ||||
|     import_code(code_path) | ||||
|     setup_gpu(use_gpu) | ||||
|  | @ -119,7 +120,8 @@ def init_labels_cli( | |||
|     """Generate JSON files for the labels in the data. This helps speed up the | ||||
|     training process, since spaCy won't have to preprocess the data to | ||||
|     extract the labels.""" | ||||
|     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) | ||||
|     if verbose: | ||||
|         util.logger.setLevel(logging.DEBUG) | ||||
|     if not output_path.exists(): | ||||
|         output_path.mkdir(parents=True) | ||||
|     overrides = parse_config_overrides(ctx.args) | ||||
|  |  | |||
|  | @ -1,5 +1,8 @@ | |||
| import importlib.metadata | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| import subprocess | ||||
| import sys | ||||
| from collections import defaultdict | ||||
| from pathlib import Path | ||||
|  | @ -35,7 +38,7 @@ def package_cli( | |||
|     specified output directory, and the data will be copied over. If | ||||
|     --create-meta is set and a meta.json already exists in the output directory, | ||||
|     the existing values will be used as the defaults in the command-line prompt. | ||||
|     After packaging, "python setup.py sdist" is run in the package directory, | ||||
|     After packaging, "python -m build --sdist" is run in the package directory, | ||||
|     which will create a .tar.gz archive that can be installed via "pip install". | ||||
| 
 | ||||
|     If additional code files are provided (e.g. Python files containing custom | ||||
|  | @ -78,9 +81,17 @@ def package( | |||
|     input_path = util.ensure_path(input_dir) | ||||
|     output_path = util.ensure_path(output_dir) | ||||
|     meta_path = util.ensure_path(meta_path) | ||||
|     if create_wheel and not has_wheel(): | ||||
|         err = "Generating a binary .whl file requires wheel to be installed" | ||||
|         msg.fail(err, "pip install wheel", exits=1) | ||||
|     if create_wheel and not has_wheel() and not has_build(): | ||||
|         err = ( | ||||
|             "Generating wheels requires 'build' or 'wheel' (deprecated) to be installed" | ||||
|         ) | ||||
|         msg.fail(err, "pip install build", exits=1) | ||||
|     if not has_build(): | ||||
|         msg.warn( | ||||
|             "Generating packages without the 'build' package is deprecated and " | ||||
|             "will not be supported in the future. To install 'build': pip " | ||||
|             "install build" | ||||
|         ) | ||||
|     if not input_path or not input_path.exists(): | ||||
|         msg.fail("Can't locate pipeline data", input_path, exits=1) | ||||
|     if not output_path or not output_path.exists(): | ||||
|  | @ -184,12 +195,37 @@ def package( | |||
|     msg.good(f"Successfully created package directory '{model_name_v}'", main_path) | ||||
|     if create_sdist: | ||||
|         with util.working_dir(main_path): | ||||
|             util.run_command([sys.executable, "setup.py", "sdist"], capture=False) | ||||
|             # run directly, since util.run_command is not designed to continue | ||||
|             # after a command fails | ||||
|             ret = subprocess.run( | ||||
|                 [sys.executable, "-m", "build", ".", "--sdist"], | ||||
|                 env=os.environ.copy(), | ||||
|             ) | ||||
|             if ret.returncode != 0: | ||||
|                 msg.warn( | ||||
|                     "Creating sdist with 'python -m build' failed. Falling " | ||||
|                     "back to deprecated use of 'python setup.py sdist'" | ||||
|                 ) | ||||
|                 util.run_command([sys.executable, "setup.py", "sdist"], capture=False) | ||||
|         zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}" | ||||
|         msg.good(f"Successfully created zipped Python package", zip_file) | ||||
|     if create_wheel: | ||||
|         with util.working_dir(main_path): | ||||
|             util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False) | ||||
|             # run directly, since util.run_command is not designed to continue | ||||
|             # after a command fails | ||||
|             ret = subprocess.run( | ||||
|                 [sys.executable, "-m", "build", ".", "--wheel"], | ||||
|                 env=os.environ.copy(), | ||||
|             ) | ||||
|             if ret.returncode != 0: | ||||
|                 msg.warn( | ||||
|                     "Creating wheel with 'python -m build' failed. Falling " | ||||
|                     "back to deprecated use of 'wheel' with " | ||||
|                     "'python setup.py bdist_wheel'" | ||||
|                 ) | ||||
|                 util.run_command( | ||||
|                     [sys.executable, "setup.py", "bdist_wheel"], capture=False | ||||
|                 ) | ||||
|         wheel_name_squashed = re.sub("_+", "_", model_name_v) | ||||
|         wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}" | ||||
|         msg.good(f"Successfully created binary wheel", wheel) | ||||
|  | @ -209,6 +245,17 @@ def has_wheel() -> bool: | |||
|         return False | ||||
| 
 | ||||
| 
 | ||||
def has_build() -> bool:
    """Report whether the 'build' distribution is installed.

    RETURNS (bool): True if package metadata for 'build' can be found,
        False otherwise.
    """
    # An import check would be misleading here: a local directory called
    # build/ is very common (especially with editable installs), so ask for
    # the installed package's version metadata instead.
    try:
        importlib.metadata.version("build")
    except importlib.metadata.PackageNotFoundError:  # type: ignore[attr-defined]
        return False
    return True
| 
 | ||||
| 
 | ||||
| def get_third_party_dependencies( | ||||
|     config: Config, exclude: List[str] = util.SimpleFrozenList() | ||||
| ) -> List[str]: | ||||
|  | @ -403,7 +450,7 @@ def _format_sources(data: Any) -> str: | |||
|         if author: | ||||
|             result += " ({})".format(author) | ||||
|         sources.append(result) | ||||
|     return "<br />".join(sources) | ||||
|     return "<br>".join(sources) | ||||
| 
 | ||||
| 
 | ||||
| def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> str: | ||||
|  |  | |||
|  | @ -71,7 +71,7 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> | |||
| 
 | ||||
| 
 | ||||
| def parse_texts(nlp: Language, texts: Sequence[str]) -> None: | ||||
|     for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): | ||||
|     for doc in nlp.pipe(tqdm.tqdm(texts, disable=None), batch_size=16): | ||||
|         pass | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,217 +1 @@ | |||
| import os | ||||
| import re | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Optional | ||||
| 
 | ||||
| import requests | ||||
| import typer | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ...util import ensure_path, working_dir | ||||
| from .._util import ( | ||||
|     PROJECT_FILE, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     SimpleFrozenDict, | ||||
|     download_file, | ||||
|     get_checksum, | ||||
|     get_git_version, | ||||
|     git_checkout, | ||||
|     load_project_config, | ||||
|     parse_config_overrides, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| # Whether assets are extra if `extra` is not set. | ||||
| EXTRA_DEFAULT = False | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command( | ||||
|     "assets", | ||||
|     context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, | ||||
| ) | ||||
| def project_assets_cli( | ||||
|     # fmt: off | ||||
|     ctx: typer.Context,  # This is only used to read additional arguments | ||||
|     project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."), | ||||
|     extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.") | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Fetch project assets like datasets and pretrained weights. Assets are | ||||
|     defined in the "assets" section of the project.yml. If a checksum is | ||||
|     provided in the project.yml, the file is only downloaded if no local file | ||||
|     with the same checksum exists. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/cli#project-assets | ||||
|     """ | ||||
|     overrides = parse_config_overrides(ctx.args) | ||||
|     project_assets( | ||||
|         project_dir, | ||||
|         overrides=overrides, | ||||
|         sparse_checkout=sparse_checkout, | ||||
|         extra=extra, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def project_assets( | ||||
|     project_dir: Path, | ||||
|     *, | ||||
|     overrides: Dict[str, Any] = SimpleFrozenDict(), | ||||
|     sparse_checkout: bool = False, | ||||
|     extra: bool = False, | ||||
| ) -> None: | ||||
|     """Fetch assets for a project using DVC if possible. | ||||
| 
 | ||||
|     project_dir (Path): Path to project directory. | ||||
|     sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files | ||||
|                             needed. | ||||
|     extra (bool): Whether to download all assets, including those marked as 'extra'. | ||||
|     """ | ||||
|     project_path = ensure_path(project_dir) | ||||
|     config = load_project_config(project_path, overrides=overrides) | ||||
|     assets = [ | ||||
|         asset | ||||
|         for asset in config.get("assets", []) | ||||
|         if extra or not asset.get("extra", EXTRA_DEFAULT) | ||||
|     ] | ||||
|     if not assets: | ||||
|         msg.warn( | ||||
|             f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)", | ||||
|             exits=0, | ||||
|         ) | ||||
|     msg.info(f"Fetching {len(assets)} asset(s)") | ||||
| 
 | ||||
|     for asset in assets: | ||||
|         dest = (project_dir / asset["dest"]).resolve() | ||||
|         checksum = asset.get("checksum") | ||||
|         if "git" in asset: | ||||
|             git_err = ( | ||||
|                 f"Cloning spaCy project templates requires Git and the 'git' command. " | ||||
|                 f"Make sure it's installed and that the executable is available." | ||||
|             ) | ||||
|             get_git_version(error=git_err) | ||||
|             if dest.exists(): | ||||
|                 # If there's already a file, check for checksum | ||||
|                 if checksum and checksum == get_checksum(dest): | ||||
|                     msg.good( | ||||
|                         f"Skipping download with matching checksum: {asset['dest']}" | ||||
|                     ) | ||||
|                     continue | ||||
|                 else: | ||||
|                     if dest.is_dir(): | ||||
|                         shutil.rmtree(dest) | ||||
|                     else: | ||||
|                         dest.unlink() | ||||
|             if "repo" not in asset["git"] or asset["git"]["repo"] is None: | ||||
|                 msg.fail( | ||||
|                     "A git asset must include 'repo', the repository address.", exits=1 | ||||
|                 ) | ||||
|             if "path" not in asset["git"] or asset["git"]["path"] is None: | ||||
|                 msg.fail( | ||||
|                     "A git asset must include 'path' - use \"\" to get the entire repository.", | ||||
|                     exits=1, | ||||
|                 ) | ||||
|             git_checkout( | ||||
|                 asset["git"]["repo"], | ||||
|                 asset["git"]["path"], | ||||
|                 dest, | ||||
|                 branch=asset["git"].get("branch"), | ||||
|                 sparse=sparse_checkout, | ||||
|             ) | ||||
|             msg.good(f"Downloaded asset {dest}") | ||||
|         else: | ||||
|             url = asset.get("url") | ||||
|             if not url: | ||||
|                 # project.yml defines asset without URL that the user has to place | ||||
|                 check_private_asset(dest, checksum) | ||||
|                 continue | ||||
|             fetch_asset(project_path, url, dest, checksum) | ||||
| 
 | ||||
| 
 | ||||
| def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None: | ||||
|     """Check and validate assets without a URL (private assets that the user | ||||
|     has to provide themselves) and give feedback about the checksum. | ||||
| 
 | ||||
|     dest (Path): Destination path of the asset. | ||||
|     checksum (Optional[str]): Optional checksum of the expected file. | ||||
|     """ | ||||
|     if not Path(dest).exists(): | ||||
|         err = f"No URL provided for asset. You need to add this file yourself: {dest}" | ||||
|         msg.warn(err) | ||||
|     else: | ||||
|         if not checksum: | ||||
|             msg.good(f"Asset already exists: {dest}") | ||||
|         elif checksum == get_checksum(dest): | ||||
|             msg.good(f"Asset exists with matching checksum: {dest}") | ||||
|         else: | ||||
|             msg.fail(f"Asset available but with incorrect checksum: {dest}") | ||||
| 
 | ||||
| 
 | ||||
| def fetch_asset( | ||||
|     project_path: Path, url: str, dest: Path, checksum: Optional[str] = None | ||||
| ) -> None: | ||||
|     """Fetch an asset from a given URL or path. If a checksum is provided and a | ||||
|     local file exists, it's only re-downloaded if the checksum doesn't match. | ||||
| 
 | ||||
|     project_path (Path): Path to project directory. | ||||
|     url (str): URL or path to asset. | ||||
|     checksum (Optional[str]): Optional expected checksum of local file. | ||||
|     RETURNS (Optional[Path]): The path to the fetched asset or None if fetching | ||||
|         the asset failed. | ||||
|     """ | ||||
|     dest_path = (project_path / dest).resolve() | ||||
|     if dest_path.exists(): | ||||
|         # If there's already a file, check for checksum | ||||
|         if checksum: | ||||
|             if checksum == get_checksum(dest_path): | ||||
|                 msg.good(f"Skipping download with matching checksum: {dest}") | ||||
|                 return | ||||
|         else: | ||||
|             # If there's not a checksum, make sure the file is a possibly valid size | ||||
|             if os.path.getsize(dest_path) == 0: | ||||
|                 msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}") | ||||
|                 os.remove(dest_path) | ||||
|     # We might as well support the user here and create parent directories in | ||||
|     # case the asset dir isn't listed as a dir to create in the project.yml | ||||
|     if not dest_path.parent.exists(): | ||||
|         dest_path.parent.mkdir(parents=True) | ||||
|     with working_dir(project_path): | ||||
|         url = convert_asset_url(url) | ||||
|         try: | ||||
|             download_file(url, dest_path) | ||||
|             msg.good(f"Downloaded asset {dest}") | ||||
|         except requests.exceptions.RequestException as e: | ||||
|             if Path(url).exists() and Path(url).is_file(): | ||||
|                 # If it's a local file, copy to destination | ||||
|                 shutil.copy(url, str(dest_path)) | ||||
|                 msg.good(f"Copied local asset {dest}") | ||||
|             else: | ||||
|                 msg.fail(f"Download failed: {dest}", e) | ||||
|     if checksum and checksum != get_checksum(dest_path): | ||||
|         msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}") | ||||
| 
 | ||||
| 
 | ||||
| def convert_asset_url(url: str) -> str: | ||||
|     """Check and convert the asset URL if needed. | ||||
| 
 | ||||
|     url (str): The asset URL. | ||||
|     RETURNS (str): The converted URL. | ||||
|     """ | ||||
|     # If the asset URL is a regular GitHub URL it's likely a mistake | ||||
|     if ( | ||||
|         re.match(r"(http(s?)):\/\/github.com", url) | ||||
|         and "releases/download" not in url | ||||
|         and "/raw/" not in url | ||||
|     ): | ||||
|         converted = url.replace("github.com", "raw.githubusercontent.com") | ||||
|         converted = re.sub(r"/(tree|blob)/", "/", converted) | ||||
|         msg.warn( | ||||
|             "Downloading from a regular GitHub URL. This will only download " | ||||
|             "the source of the page, not the actual file. Converting the URL " | ||||
|             "to a raw URL.", | ||||
|             converted, | ||||
|         ) | ||||
|         return converted | ||||
|     return url | ||||
| from weasel.cli.assets import * | ||||
|  |  | |||
|  | @ -1,124 +1 @@ | |||
| import re | ||||
| import subprocess | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ... import about | ||||
| from ...util import ensure_path | ||||
| from .._util import ( | ||||
|     COMMAND, | ||||
|     PROJECT_FILE, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     get_git_version, | ||||
|     git_checkout, | ||||
|     git_repo_branch_exists, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| DEFAULT_REPO = about.__projects__ | ||||
| DEFAULT_PROJECTS_BRANCH = about.__projects_branch__ | ||||
| DEFAULT_BRANCHES = ["main", "master"] | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command("clone") | ||||
| def project_clone_cli( | ||||
|     # fmt: off | ||||
|     name: str = Arg(..., help="The name of the template to clone"), | ||||
|     dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False), | ||||
|     repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"), | ||||
|     branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"), | ||||
|     sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.") | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Clone a project template from a repository. Calls into "git" and will | ||||
|     only download the files from the given subdirectory. The GitHub repo | ||||
|     defaults to the official spaCy template repo, but can be customized | ||||
|     (including using a private repo). | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/cli#project-clone | ||||
|     """ | ||||
|     if dest is None: | ||||
|         dest = Path.cwd() / Path(name).parts[-1] | ||||
|     if repo == DEFAULT_REPO and branch is None: | ||||
|         branch = DEFAULT_PROJECTS_BRANCH | ||||
| 
 | ||||
|     if branch is None: | ||||
|         for default_branch in DEFAULT_BRANCHES: | ||||
|             if git_repo_branch_exists(repo, default_branch): | ||||
|                 branch = default_branch | ||||
|                 break | ||||
|         if branch is None: | ||||
|             default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES) | ||||
|             msg.fail( | ||||
|                 "No branch provided and attempted default " | ||||
|                 f"branches {default_branches_msg} do not exist.", | ||||
|                 exits=1, | ||||
|             ) | ||||
|     else: | ||||
|         if not git_repo_branch_exists(repo, branch): | ||||
|             msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1) | ||||
|     assert isinstance(branch, str) | ||||
|     project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout) | ||||
| 
 | ||||
| 
 | ||||
| def project_clone( | ||||
|     name: str, | ||||
|     dest: Path, | ||||
|     *, | ||||
|     repo: str = about.__projects__, | ||||
|     branch: str = about.__projects_branch__, | ||||
|     sparse_checkout: bool = False, | ||||
| ) -> None: | ||||
|     """Clone a project template from a repository. | ||||
| 
 | ||||
|     name (str): Name of subdirectory to clone. | ||||
|     dest (Path): Destination path of cloned project. | ||||
|     repo (str): URL of Git repo containing project templates. | ||||
|     branch (str): The branch to clone from | ||||
|     """ | ||||
|     dest = ensure_path(dest) | ||||
|     check_clone(name, dest, repo) | ||||
|     project_dir = dest.resolve() | ||||
|     repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo) | ||||
|     try: | ||||
|         git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout) | ||||
|     except subprocess.CalledProcessError: | ||||
|         err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')" | ||||
|         msg.fail(err, exits=1) | ||||
|     msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir) | ||||
|     if not (project_dir / PROJECT_FILE).exists(): | ||||
|         msg.warn(f"No {PROJECT_FILE} found in directory") | ||||
|     else: | ||||
|         msg.good(f"Your project is now ready!") | ||||
|         print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}") | ||||
| 
 | ||||
| 
 | ||||
| def check_clone(name: str, dest: Path, repo: str) -> None: | ||||
|     """Check and validate that the destination path can be used to clone. Will | ||||
|     check that Git is available and that the destination path is suitable. | ||||
| 
 | ||||
|     name (str): Name of the directory to clone from the repo. | ||||
|     dest (Path): Local destination of cloned directory. | ||||
|     repo (str): URL of the repo to clone from. | ||||
|     """ | ||||
|     git_err = ( | ||||
|         f"Cloning spaCy project templates requires Git and the 'git' command. " | ||||
|         f"To clone a project without Git, copy the files from the '{name}' " | ||||
|         f"directory in the {repo} to {dest} manually." | ||||
|     ) | ||||
|     get_git_version(error=git_err) | ||||
|     if not dest: | ||||
|         msg.fail(f"Not a valid directory to clone project: {dest}", exits=1) | ||||
|     if dest.exists(): | ||||
|         # Directory already exists (not allowed, clone needs to create it) | ||||
|         msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1) | ||||
|     if not dest.parent.exists(): | ||||
|         # We're not creating parents, parent dir should exist | ||||
|         msg.fail( | ||||
|             f"Can't clone project, parent directory doesn't exist: {dest.parent}. " | ||||
|             f"Create the necessary folder(s) first before continuing.", | ||||
|             exits=1, | ||||
|         ) | ||||
| from weasel.cli.clone import * | ||||
|  |  | |||
|  | @ -1,115 +1 @@ | |||
| from pathlib import Path | ||||
| 
 | ||||
| from wasabi import MarkdownRenderer, msg | ||||
| 
 | ||||
| from ...util import working_dir | ||||
| from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli | ||||
| 
 | ||||
| DOCS_URL = "https://spacy.io" | ||||
| INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the | ||||
| project, as well as the available commands and workflows. For details, see the | ||||
| [spaCy projects documentation]({DOCS_URL}/usage/projects).""" | ||||
| INTRO_COMMANDS = f"""The following commands are defined by the project. They | ||||
| can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run). | ||||
| Commands are only re-run if their inputs have changed.""" | ||||
| INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They | ||||
| can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run) | ||||
| and will run the specified commands in order. Commands are only re-run if their | ||||
| inputs have changed.""" | ||||
| INTRO_ASSETS = f"""The following assets are defined by the project. They can | ||||
| be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets) | ||||
| in the project directory.""" | ||||
| # These markers are added to the Markdown and can be used to update the file in | ||||
| # place if it already exists. Only the auto-generated part will be replaced. | ||||
| MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->" | ||||
| MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->" | ||||
| # If this marker is used in an existing README, it's ignored and not replaced | ||||
| MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->" | ||||
| 
 | ||||
| 
 | ||||
| @project_cli.command("document") | ||||
| def project_document_cli( | ||||
|     # fmt: off | ||||
|     project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False), | ||||
|     output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"), | ||||
|     no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji") | ||||
|     # fmt: on | ||||
| ): | ||||
|     """ | ||||
|     Auto-generate a README.md for a project. If the content is saved to a file, | ||||
|     hidden markers are added so you can add custom content before or after the | ||||
|     auto-generated section and only the auto-generated docs will be replaced | ||||
|     when you re-run the command. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/cli#project-document | ||||
|     """ | ||||
|     project_document(project_dir, output_file, no_emoji=no_emoji) | ||||
| 
 | ||||
| 
 | ||||
| def project_document( | ||||
|     project_dir: Path, output_file: Path, *, no_emoji: bool = False | ||||
| ) -> None: | ||||
|     is_stdout = str(output_file) == "-" | ||||
|     config = load_project_config(project_dir) | ||||
|     md = MarkdownRenderer(no_emoji=no_emoji) | ||||
|     md.add(MARKER_START) | ||||
|     title = config.get("title") | ||||
|     description = config.get("description") | ||||
|     md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐")) | ||||
|     if description: | ||||
|         md.add(description) | ||||
|     md.add(md.title(2, PROJECT_FILE, "📋")) | ||||
|     md.add(INTRO_PROJECT) | ||||
|     # Commands | ||||
|     cmds = config.get("commands", []) | ||||
|     data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds] | ||||
|     if data: | ||||
|         md.add(md.title(3, "Commands", "⏯")) | ||||
|         md.add(INTRO_COMMANDS) | ||||
|         md.add(md.table(data, ["Command", "Description"])) | ||||
|     # Workflows | ||||
|     wfs = config.get("workflows", {}).items() | ||||
|     data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs] | ||||
|     if data: | ||||
|         md.add(md.title(3, "Workflows", "⏭")) | ||||
|         md.add(INTRO_WORKFLOWS) | ||||
|         md.add(md.table(data, ["Workflow", "Steps"])) | ||||
|     # Assets | ||||
|     assets = config.get("assets", []) | ||||
|     data = [] | ||||
|     for a in assets: | ||||
|         source = "Git" if a.get("git") else "URL" if a.get("url") else "Local" | ||||
|         dest_path = a["dest"] | ||||
|         dest = md.code(dest_path) | ||||
|         if source == "Local": | ||||
|             # Only link assets if they're in the repo | ||||
|             with working_dir(project_dir) as p: | ||||
|                 if (p / dest_path).exists(): | ||||
|                     dest = md.link(dest, dest_path) | ||||
|         data.append((dest, source, a.get("description", ""))) | ||||
|     if data: | ||||
|         md.add(md.title(3, "Assets", "🗂")) | ||||
|         md.add(INTRO_ASSETS) | ||||
|         md.add(md.table(data, ["File", "Source", "Description"])) | ||||
|     md.add(MARKER_END) | ||||
|     # Output result | ||||
|     if is_stdout: | ||||
|         print(md.text) | ||||
|     else: | ||||
|         content = md.text | ||||
|         if output_file.exists(): | ||||
|             with output_file.open("r", encoding="utf8") as f: | ||||
|                 existing = f.read() | ||||
|             if MARKER_IGNORE in existing: | ||||
|                 msg.warn("Found ignore marker in existing file: skipping", output_file) | ||||
|                 return | ||||
|             if MARKER_START in existing and MARKER_END in existing: | ||||
|                 msg.info("Found existing file: only replacing auto-generated docs") | ||||
|                 before = existing.split(MARKER_START)[0] | ||||
|                 after = existing.split(MARKER_END)[1] | ||||
|                 content = f"{before}{content}{after}" | ||||
|             else: | ||||
|                 msg.warn("Replacing existing file") | ||||
|         with output_file.open("w", encoding="utf8") as f: | ||||
|             f.write(content) | ||||
|         msg.good("Saved project documentation", output_file) | ||||
| from weasel.cli.document import * | ||||
|  |  | |||
|  | @ -1,220 +1 @@ | |||
| """This module contains helpers and subcommands for integrating spaCy projects | ||||
| with Data Version Controk (DVC). https://dvc.org""" | ||||
| import subprocess | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Iterable, List, Optional | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ...util import ( | ||||
|     SimpleFrozenList, | ||||
|     join_command, | ||||
|     run_command, | ||||
|     split_command, | ||||
|     working_dir, | ||||
| ) | ||||
| from .._util import ( | ||||
|     COMMAND, | ||||
|     NAME, | ||||
|     PROJECT_FILE, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     get_hash, | ||||
|     load_project_config, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| DVC_CONFIG = "dvc.yaml" | ||||
| DVC_DIR = ".dvc" | ||||
| UPDATE_COMMAND = "dvc" | ||||
| DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've | ||||
| # edited your {PROJECT_FILE}, you can regenerate this file by running: | ||||
| # {COMMAND} project {UPDATE_COMMAND}""" | ||||
| 
 | ||||
| 
 | ||||
# CLI entry point, registered on the project CLI under UPDATE_COMMAND.
# NOTE(review): the parameter defaults (Arg/Opt) and the docstring are handed
# to the CLI framework -- presumably surfaced as help text -- so they are left
# untouched here.
@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the project.yml
    changed.

    DOCS: https://spacy.io/api/cli#project-dvc
    """
    # Thin wrapper: all of the work is delegated to project_update_dvc.
    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
| 
 | ||||
| 
 | ||||
def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> None:
    """Load the project config, refresh the generated DVC config from it and
    report whether anything was rewritten.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Name of the workflow to convert. Passed through
        to update_dvc_config, which falls back to the first workflow.
    verbose (bool): Print more info.
    quiet (bool): Print less info.
    force (bool): Force update DVC config.
    """
    project_config = load_project_config(project_dir)
    was_updated = update_dvc_config(
        project_dir,
        project_config,
        workflow,
        verbose=verbose,
        quiet=quiet,
        force=force,
    )
    # The same hint is attached to both outcomes.
    repro_hint = "To execute the workflow with DVC, run: dvc repro"
    if not was_updated:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", repro_hint)
        return
    msg.good(f"Updated DVC config from {PROJECT_FILE}", repro_hint)
| 
 | ||||
| 
 | ||||
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> bool:
    """Regenerate the dvc.yaml file in the project directory from the project
    config. The first line of the generated file stores the hash of the config
    dict; if that hash still matches and force is not set, nothing is done.
    Otherwise the old file is removed, one "dvc run ... --no-exec" command is
    executed per usable workflow step, and the hash plus an explanatory
    comment are prepended to the file DVC produced.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    workflow (Optional[str]): Workflow to convert. Defaults to the first one.
    verbose (bool): Whether to print additional info (via DVC).
    quiet (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    ensure_dvc(path)
    workflows = config.get("workflows", {})
    available = list(workflows.keys())
    check_workflows(available, workflow)
    if not workflow:
        workflow = available[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # The hash of the config that produced the file lives on its first line.
        with dvc_config_path.open("r", encoding="utf8") as f:
            first_line = f.readline()
        stored_hash = first_line.strip().replace("# ", "")
        if not force and stored_hash == config_hash:
            return False  # project.yml is unchanged, keep the existing file
        dvc_config_path.unlink()
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}

    # Flags shared by every generated command
    toggles = ((verbose, "--verbose"), (quiet, "--quiet"))
    flags = [flag for enabled, flag in toggles if enabled]

    dvc_commands = []
    for name in workflows[workflow]:
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not (deps or outputs or outputs_no_cache):
            continue
        # The resolved project path is used as the working dir, since dvc.yaml
        # is auto-generated and should not contain arbitrary paths.
        dvc_args = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            dvc_args.append("--always-changed")
        for option, values in (("-d", deps), ("-o", outputs), ("-O", outputs_no_cache)):
            for value in values:
                dvc_args.extend((option, value))
        dvc_args.extend(["python", "-m", NAME, "project", "run", name])
        dvc_commands.append(join_command(dvc_args))

    if not dvc_commands:
        # Without this check, reading the config below would fail because DVC
        # would never have created it.
        msg.fail(
            "No usable commands for DVC found. This can happen if none of your "
            "commands have dependencies or outputs.",
            exits=1,
        )

    with working_dir(path):
        for dvc_args_str in dvc_commands:
            run_command("dvc " + dvc_args_str)
    # Prepend the hash line and the explanatory comment to what DVC wrote.
    with dvc_config_path.open("r+", encoding="utf8") as f:
        generated = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{generated}")
    return True
| 
 | ||||
| 
 | ||||
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Check that at least one workflow exists and that the requested one, if
    any, is among them. Failures are reported via msg.fail with exits=1; a
    warning is printed when no workflow was requested.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        none_defined = (
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands."
        )
        msg.fail(none_defined, exits=1)
    is_unknown = workflow is not None and workflow not in workflows
    if is_unknown:
        not_found = (
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {', '.join(workflows)}"
        )
        msg.fail(not_found, exits=1)
    if not workflow:
        fallback_notice = (
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{workflows[0]}'"
        )
        msg.warn(fallback_notice)
| 
 | ||||
| 
 | ||||
def ensure_dvc(project_dir: Path) -> None:
    """Ensure that the "dvc" command is available and that the current project
    directory is an initialized DVC project. Both problems are reported via
    msg.fail with exits=1.

    project_dir (Path): The project directory expected to contain DVC_DIR.
    """
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except OSError:
        # A missing or non-executable binary surfaces as an OSError subclass
        # (e.g. FileNotFoundError). Anything else is unrelated to DVC not
        # being installed and should not be reported as such.
        msg.fail(
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )
    # Use the module-level constant instead of repeating the literal name.
    if not (project_dir / DVC_DIR).exists():
        msg.fail(
            "Project not initialized as a DVC project",
            "To initialize a DVC project, you can run 'dvc init' in the project "
            "directory. For more details, see the documentation: "
            "https://dvc.org/doc/command-reference/init",
            exits=1,
        )
| from weasel.cli.dvc import * | ||||
|  |  | |||
|  | @ -1,67 +1 @@ | |||
| from pathlib import Path | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .._util import Arg, load_project_config, logger, project_cli | ||||
| from .remote_storage import RemoteStorage, get_command_hash | ||||
| from .run import update_lockfile | ||||
| 
 | ||||
| 
 | ||||
@project_cli.command("pull")
def project_pull_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Retrieve available precomputed outputs from a remote storage.
    You can alias remotes in your project.yml by mapping them to storage paths.
    A storage can be anything that the smart-open library can upload to, e.g.
    AWS, Google Cloud Storage, SSH, local directories etc.

    DOCS: https://spacy.io/api/cli#project-pull
    """
    pulled = project_pull(project_dir, remote)
    for url, output_path in pulled:
        # A missing URL means nothing was fetched for this output.
        if url is None:
            continue
        msg.good(f"Pulled {output_path} from {url}")
| 
 | ||||
| 
 | ||||
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
    """Pull the outputs of all project commands whose dependencies exist from
    a remote storage, yielding one (url, output_path) pair per output.

    project_dir (Path): The project directory.
    remote (str): Name of a remote defined in the project config, or a
        storage path that is used as-is.
    verbose (bool): Not used by the visible implementation.
    YIELDS (tuple): The value returned by the storage's pull() and the
        output path it was requested for.
    """
    # TODO: We don't have tests for this :(. It would take a bit of mockery to
    # set up. I guess see if it breaks first?
    config = load_project_config(project_dir)
    remotes = config.get("remotes", {})
    if remote in remotes:
        remote = remotes[remote]
    storage = RemoteStorage(project_dir, remote)
    pending = list(config.get("commands", []))
    # The order of the commands is unknown: one command may depend on the
    # outputs of a later one. So keep sweeping over the remaining commands
    # and restart after every command that could be handled, until a full
    # sweep handles nothing.
    made_progress = True
    while pending and made_progress:
        made_progress = False
        for index, cmd in enumerate(pending):
            logger.debug("CMD: %s.", cmd["name"])
            deps = [project_dir / dep for dep in cmd.get("deps", [])]
            if not all(dep.exists() for dep in deps):
                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
                continue
            cmd_hash = get_command_hash("", "", deps, cmd["script"])
            outputs = cmd.get("outputs", [])
            for output_path in outputs:
                url = storage.pull(output_path, command_hash=cmd_hash)
                logger.debug(
                    "URL: %s for %s with command hash %s",
                    url,
                    output_path,
                    cmd_hash,
                )
                yield url, output_path

            if all((project_dir / out).exists() for out in outputs):
                update_lockfile(project_dir, cmd)
            # Drop the handled command and start a fresh sweep right away, so
            # the list is never iterated after being modified.
            pending.pop(index)
            made_progress = True
            break
| from weasel.cli.pull import * | ||||
|  |  | |||
|  | @ -1,69 +1 @@ | |||
| from pathlib import Path | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from .._util import Arg, load_project_config, logger, project_cli | ||||
| from .remote_storage import RemoteStorage, get_command_hash, get_content_hash | ||||
| 
 | ||||
| 
 | ||||
@project_cli.command("push")
def project_push_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Persist outputs to a remote storage. You can alias remotes in your
    project.yml by mapping them to storage paths. A storage can be anything that
    the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
    local directories etc.

    DOCS: https://spacy.io/api/cli#project-push
    """
    pushed = project_push(project_dir, remote)
    for output_path, url in pushed:
        if url is not None:
            msg.good(f"Pushed {output_path} to {url}")
        else:
            msg.info(f"Skipping {output_path}")
| 
 | ||||
| 
 | ||||
def project_push(project_dir: Path, remote: str):
    """Push the existing, non-empty outputs of all project commands whose
    dependencies exist to a remote storage, yielding one (output_path, url)
    pair per pushed output.

    project_dir (Path): The project directory.
    remote (str): Name of a remote defined in the project config, or a
        storage path that is used as-is.
    YIELDS (tuple): The output path and the value returned by the storage's
        push() for it.
    """
    config = load_project_config(project_dir)
    remotes = config.get("remotes", {})
    if remote in remotes:
        remote = remotes[remote]
    storage = RemoteStorage(project_dir, remote)
    for cmd in config.get("commands", []):
        logger.debug("CMD: %s", cmd["name"])
        deps = [project_dir / dep for dep in cmd.get("deps", [])]
        if not all(dep.exists() for dep in deps):
            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
            continue
        cmd_hash = get_command_hash("", "", deps, cmd["script"])
        logger.debug("CMD_HASH: %s", cmd_hash)
        for output_path in cmd.get("outputs", []):
            output_loc = project_dir / output_path
            # Outputs that are missing or are empty directories are skipped.
            if not (output_loc.exists() and _is_not_empty_dir(output_loc)):
                continue
            url = storage.push(
                output_path,
                command_hash=cmd_hash,
                content_hash=get_content_hash(output_loc),
            )
            logger.debug(
                "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
            )
            yield output_path, url
| 
 | ||||
| 
 | ||||
def _is_not_empty_dir(loc: Path):
    """Check whether a path is anything other than an empty directory tree.

    loc (Path): The path to check.
    RETURNS (bool): True if loc is not a directory, or if it is a directory
        with at least one non-directory entry somewhere below it. False for
        a directory that only contains (nested) empty directories.
    """
    if loc.is_dir():
        return any(_is_not_empty_dir(child) for child in loc.iterdir())
    return True
| from weasel.cli.push import * | ||||
|  |  | |||
|  | @ -1,212 +1 @@ | |||
| import hashlib | ||||
| import os | ||||
| import site | ||||
| import tarfile | ||||
| import urllib.parse | ||||
| from pathlib import Path | ||||
| from typing import TYPE_CHECKING, Dict, List, Optional | ||||
| 
 | ||||
| from wasabi import msg | ||||
| 
 | ||||
| from ... import about | ||||
| from ...errors import Errors | ||||
| from ...git_info import GIT_VERSION | ||||
| from ...util import ENV_VARS, check_bool_env_var, get_minor_version | ||||
| from .._util import ( | ||||
|     download_file, | ||||
|     ensure_pathy, | ||||
|     get_checksum, | ||||
|     get_hash, | ||||
|     make_tempdir, | ||||
|     upload_file, | ||||
| ) | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from pathy import FluidPath  # noqa: F401 | ||||
| 
 | ||||
| 
 | ||||
class RemoteStorage:
    """Push and pull outputs to and from a remote file storage.

    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
    ssh, etc.
    """

    def __init__(self, project_root: Path, url: str, *, compression="gz"):
        """Set up the storage.

        project_root (Path): Local project directory that paths are relative to.
        url (str): Location of the remote storage, converted via ensure_pathy.
        compression (str): Compression suffix for the tarfile mode ("gz" by
            default). A falsy value means the archives are not compressed.
        """
        self.root = project_root
        self.url = ensure_pathy(url)
        self.compression = compression

    def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
        """Compress a file or directory within a project and upload it to a remote
        storage. If an object exists at the full URL, nothing is done.

        Within the remote storage, files are addressed by their project path
        (url encoded) and two user-supplied hashes, representing their creation
        context and their file contents. If the URL already exists, the data is
        not uploaded. Paths are archived and compressed prior to upload.

        path (Path): Path of the file or directory, relative to the project root.
        command_hash (str): Hash representing the creation context.
        content_hash (str): Hash representing the contents.
        RETURNS (FluidPath): The remote URL of the (possibly pre-existing) object.
        RAISES (IOError): If the local path does not exist.
        """
        loc = self.root / path
        if not loc.exists():
            raise IOError(f"Cannot push {loc}: does not exist.")
        url = self.make_url(path, command_hash, content_hash)
        if url.exists():
            return url
        tmp: Path
        with make_tempdir() as tmp:
            tar_loc = tmp / self.encode_name(str(path))
            mode_string = f"w:{self.compression}" if self.compression else "w"
            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                # Store the member under its project-relative name, so that
                # pull() can extract it straight into the project root.
                tar_file.add(str(loc), arcname=str(path))
            upload_file(tar_loc, url)
        return url

    def pull(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["FluidPath"]:
        """Retrieve a file from the remote cache. If the file already exists
        locally, nothing is done.

        If the command_hash and/or content_hash are specified, only matching
        results are considered.

        path (Path): Path of the output, relative to the project root.
        command_hash (Optional[str]): Creation hash to match, if any.
        content_hash (Optional[str]): Content hash to match, if any.
        RETURNS (Optional[FluidPath]): The URL the archive was pulled from, or
            None if the destination already exists or no match was found.
        RAISES (ValueError): If the archive contains a member that would be
            extracted outside of the project root.
        """
        dest = self.root / path
        if dest.exists():
            return None
        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
        if url is None:
            return url
        # Make sure the destination's parent exists. exist_ok guards against
        # the directory being created between the check and the call.
        if not dest.parent.exists():
            dest.parent.mkdir(parents=True, exist_ok=True)
        tmp: Path
        with make_tempdir() as tmp:
            tar_loc = tmp / url.parts[-1]
            download_file(url, tar_loc)
            mode_string = f"r:{self.compression}" if self.compression else "r"
            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                # This requires that the path is added correctly, relative
                # to root. This is how we set things up in push()
                self._safe_extract(tar_file, self.root)
        return url

    @staticmethod
    def _is_within_directory(directory, target) -> bool:
        """Check whether target is directory itself or located below it.

        The comparison is done on whole path components (os.path.commonpath).
        A character-wise prefix check would wrongly accept sibling paths that
        merely share a name prefix, e.g. "/project-evil" for "/project".
        """
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        try:
            common = os.path.commonpath([abs_directory, abs_target])
        except ValueError:
            # Raised e.g. for paths on different drives: cannot be nested.
            return False
        return common == abs_directory

    @classmethod
    def _safe_extract(cls, tar, path) -> None:
        """Extract all members of an open tar archive into path, refusing
        archives with members that would end up outside of it (CVE-2007-4559,
        directory traversal vulnerability).
        """
        for member in tar.getmembers():
            member_path = os.path.join(path, member.name)
            if not cls._is_within_directory(path, member_path):
                raise ValueError(Errors.E852)
        tar.extractall(path)

    def find(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["FluidPath"]:
        """Find the best matching version of a file within the storage,
        or `None` if no match can be found. If both the creation and content hash
        are specified, only exact matches will be returned. Otherwise, the most
        recent matching file is preferred.

        path (Path): Path of the output, relative to the project root.
        command_hash (Optional[str]): Creation hash to match, if any.
        content_hash (Optional[str]): Content hash to match, if any.
        RETURNS (Optional[FluidPath]): The URL of the best match, or None.
        """
        name = self.encode_name(str(path))
        urls = []
        if command_hash is not None and content_hash is not None:
            url = self.url / name / command_hash / content_hash
            urls = [url] if url.exists() else []
        elif command_hash is not None:
            if (self.url / name / command_hash).exists():
                urls = list((self.url / name / command_hash).iterdir())
        else:
            # No command hash: collect the entries below every command hash
            # directory and optionally narrow them down by content hash.
            if (self.url / name).exists():
                for sub_dir in (self.url / name).iterdir():
                    urls.extend(sub_dir.iterdir())
                if content_hash is not None:
                    urls = [url for url in urls if url.parts[-1] == content_hash]
        if len(urls) >= 2:
            # Sort ascending by modification time, so the last entry is the
            # most recent one. Best effort: fall back to the unsorted order.
            try:
                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
            except Exception:
                msg.warn(
                    "Unable to sort remote files by last modified. The file(s) "
                    "pulled from the cache may not be the most recent."
                )
        return urls[-1] if urls else None

    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
        """Construct a URL from a subpath, a creation hash and a content hash."""
        return self.url / self.encode_name(str(path)) / command_hash / content_hash

    def encode_name(self, name: str) -> str:
        """Encode a subpath into a URL-safe name."""
        return urllib.parse.quote_plus(name)
| 
 | ||||
| 
 | ||||
def get_content_hash(loc: Path) -> str:
    """Return the checksum of the given path, as computed by get_checksum.

    loc (Path): The path to hash.
    RETURNS (str): The checksum.
    """
    checksum = get_checksum(loc)
    return checksum
| 
 | ||||
| 
 | ||||
def get_command_hash(
    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
) -> str:
    """Create a hash representing the execution of a command. The hash is the
    MD5 digest of the concatenation of: the spaCy version marker, the given
    site hash, the given env hash, the checksums of the sorted dependencies
    and the command strings.

    site_hash (str): Hash of the installed packages (may be empty).
    env_hash (str): Hash of the relevant environment variables (may be empty).
    deps (List[Path]): Paths of the command's dependencies.
    cmd (List[str]): The command's script lines.
    RETURNS (str): The hex digest.
    """
    # Either the exact git version or only the minor version of spaCy,
    # depending on the PROJECT_USE_GIT_VERSION environment variable.
    version_marker = (
        GIT_VERSION
        if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
        else str(get_minor_version(about.__version__) or "")
    )
    parts = [version_marker, site_hash, env_hash]
    parts.extend(get_checksum(dep) for dep in sorted(deps))
    parts.extend(cmd)
    digest_input = "".join(parts).encode("utf8")
    return hashlib.md5(digest_input).hexdigest()
| 
 | ||||
| 
 | ||||
def get_site_hash():
    """Hash the current Python environment's site-packages contents, including
    the name and version of the libraries. The list we're hashing is what
    `pip freeze` would output.

    RETURNS (str): MD5 hex digest of the sorted, concatenated names of the
        "*dist-info" entries found in the site-packages directories.
    """
    # Copy, so the list returned by the site module is never modified.
    site_dirs = list(site.getsitepackages())
    if site.ENABLE_USER_SITE:
        # getusersitepackages() returns a single path string, so it has to be
        # appended. extend() would add every character as its own "directory".
        site_dirs.append(site.getusersitepackages())
    packages = set()
    for site_dir in site_dirs:
        site_dir = Path(site_dir)
        # Site directories are not guaranteed to exist (e.g. the user site).
        if not site_dir.is_dir():
            continue
        for subpath in site_dir.iterdir():
            if subpath.parts[-1].endswith("dist-info"):
                packages.add(subpath.parts[-1].replace(".dist-info", ""))
    package_bytes = "".join(sorted(packages)).encode("utf8")
    # hashlib has no "md5sum"; the constructor is called md5.
    return hashlib.md5(package_bytes).hexdigest()
| 
 | ||||
| 
 | ||||
def get_env_hash(env: Dict[str, str]) -> str:
    """Construct a hash of the environment variables that will be passed into
    the commands.

    Values starting with "$" are treated as references to the current
    os.environ: $ENV_VAR is replaced by os.environ's value for ENV_VAR, or
    by an empty string if that variable is not set.

    env (Dict[str, str]): The variable names mapped to values or references.
    RETURNS (str): The hash of the resolved variables, via get_hash.
    """
    resolved = {
        key: os.environ.get(value[1:], "") if value.startswith("$") else value
        for key, value in env.items()
    }
    return get_hash(resolved)
| from weasel.cli.remote_storage import * | ||||
|  |  | |||
|  | @ -1,379 +1 @@ | |||
| import os.path | ||||
| import sys | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple | ||||
| 
 | ||||
| import srsly | ||||
| import typer | ||||
| from wasabi import msg | ||||
| from wasabi.util import locale_escape | ||||
| 
 | ||||
| from ... import about | ||||
| from ...git_info import GIT_VERSION | ||||
| from ...util import ( | ||||
|     ENV_VARS, | ||||
|     SimpleFrozenDict, | ||||
|     SimpleFrozenList, | ||||
|     check_bool_env_var, | ||||
|     is_cwd, | ||||
|     is_minor_version_match, | ||||
|     join_command, | ||||
|     run_command, | ||||
|     split_command, | ||||
|     working_dir, | ||||
| ) | ||||
| from .._util import ( | ||||
|     COMMAND, | ||||
|     PROJECT_FILE, | ||||
|     PROJECT_LOCK, | ||||
|     Arg, | ||||
|     Opt, | ||||
|     get_checksum, | ||||
|     get_hash, | ||||
|     load_project_config, | ||||
|     parse_config_overrides, | ||||
|     project_cli, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.

    DOCS: https://spacy.io/api/cli#project-run
    """
    wants_help = show_help or not subcommand
    if wants_help:
        print_run_help(project_dir, subcommand)
        return
    # Everything after the known options is treated as config overrides.
    extra_overrides = parse_config_overrides(ctx.args)
    project_run(
        project_dir, subcommand, overrides=extra_overrides, force=force, dry=dry
    )
| 
 | ||||
| 
 | ||||
def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
    skip_requirements_check: bool = False,
) -> None:
    """Run a named command or workflow defined in the project.yml. For a
    workflow, every listed command is run in order via a recursive call. A
    single command is only executed if check_rerun reports a change or if
    force is set, and the lockfile is updated afterwards unless this is a
    dry run.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command or workflow to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    skip_requirements_check (bool): Whether to skip the requirements check.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)

    req_path = project_dir / "requirements.txt"
    should_check_requirements = (
        not skip_requirements_check
        and config.get("check_requirements", True)
        and os.path.exists(req_path)
    )
    if should_check_requirements:
        with req_path.open() as requirements_file:
            _check_requirements([req.strip() for req in requirements_file])

    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for step in workflows[subcommand]:
            # The requirements were already handled above, so the individual
            # steps skip the check.
            project_run(
                project_dir,
                step,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
                skip_requirements_check=True,
            )
        return

    cmd = commands[subcommand]
    # A dry run only reports missing dependencies instead of exiting.
    missing_dep_exit_code = None if dry else 1
    for dep in cmd.get("deps", []):
        if (project_dir / dep).exists():
            continue
        err = f"Missing dependency specified by command '{subcommand}': {dep}"
        err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
        msg.fail(err, err_help, exits=missing_dep_exit_code)
    check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
    with working_dir(project_dir) as current_dir:
        msg.divider(subcommand)
        rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
        if force or rerun:
            run_commands(cmd["script"], dry=dry, capture=capture)
            if not dry:
                update_lockfile(current_dir, cmd)
        else:
            msg.info(f"Skipping '{cmd['name']}': nothing changed")
| 
 | ||||
| 
 | ||||
| def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: | ||||
|     """Simulate a CLI help prompt using the info available in the project.yml. | ||||
| 
 | ||||
|     project_dir (Path): The project directory. | ||||
|     subcommand (Optional[str]): The subcommand or None. If a subcommand is | ||||
|         provided, the subcommand help is shown. Otherwise, the top-level help | ||||
|         and a list of available commands is printed. | ||||
|     """ | ||||
|     config = load_project_config(project_dir) | ||||
|     config_commands = config.get("commands", []) | ||||
|     commands = {cmd["name"]: cmd for cmd in config_commands} | ||||
|     workflows = config.get("workflows", {}) | ||||
|     project_loc = "" if is_cwd(project_dir) else project_dir | ||||
|     if subcommand: | ||||
|         validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand) | ||||
|         print(f"Usage: {COMMAND} project run {subcommand} {project_loc}") | ||||
|         if subcommand in commands: | ||||
|             help_text = commands[subcommand].get("help") | ||||
|             if help_text: | ||||
|                 print(f"\n{help_text}\n") | ||||
|         elif subcommand in workflows: | ||||
|             steps = workflows[subcommand] | ||||
|             print(f"\nWorkflow consisting of {len(steps)} commands:") | ||||
|             steps_data = [ | ||||
|                 (f"{i + 1}. {step}", commands[step].get("help", "")) | ||||
|                 for i, step in enumerate(steps) | ||||
|             ] | ||||
|             msg.table(steps_data) | ||||
|             help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help" | ||||
|             print(f"For command details, run: {help_cmd}") | ||||
|     else: | ||||
|         print("") | ||||
|         title = config.get("title") | ||||
|         if title: | ||||
|             print(f"{locale_escape(title)}\n") | ||||
|         if config_commands: | ||||
|             print(f"Available commands in {PROJECT_FILE}") | ||||
|             print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}") | ||||
|             msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands]) | ||||
|         if workflows: | ||||
|             print(f"Available workflows in {PROJECT_FILE}") | ||||
|             print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}") | ||||
|             msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()]) | ||||
| 
 | ||||
| 
 | ||||
| def run_commands( | ||||
|     commands: Iterable[str] = SimpleFrozenList(), | ||||
|     silent: bool = False, | ||||
|     dry: bool = False, | ||||
|     capture: bool = False, | ||||
| ) -> None: | ||||
|     """Run a sequence of commands in a subprocess, in order. | ||||
| 
 | ||||
|     commands (List[str]): The string commands. | ||||
|     silent (bool): Don't print the commands. | ||||
|     dry (bool): Perform a dry run and don't execute anything. | ||||
|     capture (bool): Whether to capture the output and errors of individual commands. | ||||
|         If False, the stdout and stderr will not be redirected, and if there's an error, | ||||
|         sys.exit will be called with the return code. You should use capture=False | ||||
|         when you want to turn over execution to the command, and capture=True | ||||
|         when you want to run the command more like a function. | ||||
|     """ | ||||
|     for c in commands: | ||||
|         command = split_command(c) | ||||
|         # Not sure if this is needed or a good idea. Motivation: users may often | ||||
|         # use commands in their config that reference "python" and we want to | ||||
|         # make sure that it's always executing the same Python that spaCy is | ||||
|         # executed with and the pip in the same env, not some other Python/pip. | ||||
|         # Also ensures cross-compatibility if user 1 writes "python3" (because | ||||
|         # that's how it's set up on their system), and user 2 without the | ||||
|         # shortcut tries to re-run the command. | ||||
|         if len(command) and command[0] in ("python", "python3"): | ||||
|             command[0] = sys.executable | ||||
|         elif len(command) and command[0] in ("pip", "pip3"): | ||||
|             command = [sys.executable, "-m", "pip", *command[1:]] | ||||
|         if not silent: | ||||
|             print(f"Running command: {join_command(command)}") | ||||
|         if not dry: | ||||
|             run_command(command, capture=capture) | ||||
| 
 | ||||
| 
 | ||||
| def validate_subcommand( | ||||
|     commands: Sequence[str], workflows: Sequence[str], subcommand: str | ||||
| ) -> None: | ||||
|     """Check that a subcommand is valid and defined. Raises an error otherwise. | ||||
| 
 | ||||
|     commands (Sequence[str]): The available commands. | ||||
|     workflows (Sequence[str]): The available workflows. | ||||
|     subcommand (str): The subcommand. | ||||
|     """ | ||||
|     if not commands and not workflows: | ||||
|         msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1) | ||||
|     if subcommand not in commands and subcommand not in workflows: | ||||
|         help_msg = [] | ||||
|         if subcommand in ["assets", "asset"]: | ||||
|             help_msg.append("Did you mean to run: python -m spacy project assets?") | ||||
|         if commands: | ||||
|             help_msg.append(f"Available commands: {', '.join(commands)}") | ||||
|         if workflows: | ||||
|             help_msg.append(f"Available workflows: {', '.join(workflows)}") | ||||
|         msg.fail( | ||||
|             f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}", | ||||
|             ". ".join(help_msg), | ||||
|             exits=1, | ||||
|         ) | ||||
| 
 | ||||
| 
 | ||||
| def check_rerun( | ||||
|     project_dir: Path, | ||||
|     command: Dict[str, Any], | ||||
|     *, | ||||
|     check_spacy_version: bool = True, | ||||
|     check_spacy_commit: bool = False, | ||||
| ) -> bool: | ||||
|     """Check if a command should be rerun because its settings or inputs/outputs | ||||
|     changed. | ||||
| 
 | ||||
|     project_dir (Path): The current project directory. | ||||
|     command (Dict[str, Any]): The command, as defined in the project.yml. | ||||
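|     check_spacy_version (bool): Whether to re-run if the spaCy minor version changed. | ||||
|     check_spacy_commit (bool): Whether to re-run if the spaCy commit hash changed. | ||||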
|     RETURNS (bool): Whether to re-run the command. | ||||
|     """ | ||||
|     # Always rerun if no-skip is set | ||||
|     if command.get("no_skip", False): | ||||
|         return True | ||||
|     lock_path = project_dir / PROJECT_LOCK | ||||
|     if not lock_path.exists():  # We don't have a lockfile, run command | ||||
|         return True | ||||
|     data = srsly.read_yaml(lock_path) | ||||
|     if command["name"] not in data:  # We don't have info about this command | ||||
|         return True | ||||
|     entry = data[command["name"]] | ||||
|     # Always run commands with no outputs (otherwise they'd always be skipped) | ||||
|     if not entry.get("outs", []): | ||||
|         return True | ||||
|     # Always rerun if spaCy version or commit hash changed | ||||
|     spacy_v = entry.get("spacy_version") | ||||
|     commit = entry.get("spacy_git_version") | ||||
|     if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__): | ||||
|         info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)" | ||||
|         msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}") | ||||
|         return True | ||||
|     if check_spacy_commit and commit != GIT_VERSION: | ||||
|         info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)" | ||||
|         msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}") | ||||
|         return True | ||||
|     # If the entry in the lockfile matches the lockfile entry that would be | ||||
|     # generated from the current command, we don't rerun because it means that | ||||
|     # all inputs/outputs, hashes and scripts are the same and nothing changed | ||||
|     lock_entry = get_lock_entry(project_dir, command) | ||||
|     exclude = ["spacy_version", "spacy_git_version"] | ||||
|     return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude) | ||||
| 
 | ||||
| 
 | ||||
| def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None: | ||||
|     """Update the lockfile after running a command. Will create a lockfile if | ||||
|     it doesn't yet exist and will add an entry for the current command, its | ||||
|     script and dependencies/outputs. | ||||
| 
 | ||||
|     project_dir (Path): The current project directory. | ||||
|     command (Dict[str, Any]): The command, as defined in the project.yml. | ||||
|     """ | ||||
|     lock_path = project_dir / PROJECT_LOCK | ||||
|     if not lock_path.exists(): | ||||
|         srsly.write_yaml(lock_path, {}) | ||||
|         data = {} | ||||
|     else: | ||||
|         data = srsly.read_yaml(lock_path) | ||||
|     data[command["name"]] = get_lock_entry(project_dir, command) | ||||
|     srsly.write_yaml(lock_path, data) | ||||
| 
 | ||||
| 
 | ||||
| def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]: | ||||
|     """Get a lockfile entry for a given command. An entry includes the command, | ||||
|     the script (command steps) and a list of dependencies and outputs with | ||||
|     their paths and file hashes, if available. The format is based on the | ||||
|     dvc.lock files, to keep things consistent. | ||||
| 
 | ||||
|     project_dir (Path): The current project directory. | ||||
|     command (Dict[str, Any]): The command, as defined in the project.yml. | ||||
|     RETURNS (Dict[str, Any]): The lockfile entry. | ||||
|     """ | ||||
|     deps = get_fileinfo(project_dir, command.get("deps", [])) | ||||
|     outs = get_fileinfo(project_dir, command.get("outputs", [])) | ||||
|     outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", [])) | ||||
|     return { | ||||
|         "cmd": f"{COMMAND} run {command['name']}", | ||||
|         "script": command["script"], | ||||
|         "deps": deps, | ||||
|         "outs": [*outs, *outs_nc], | ||||
|         "spacy_version": about.__version__, | ||||
|         "spacy_git_version": GIT_VERSION, | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
| def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]: | ||||
|     """Generate the file information for a list of paths (dependencies, outputs). | ||||
|     Includes the file path and the file's checksum. | ||||
| 
 | ||||
|     project_dir (Path): The current project directory. | ||||
|     paths (List[str]): The file paths. | ||||
|     RETURNS (List[Dict[str, str]]): The lockfile entry for a file. | ||||
|     """ | ||||
|     data = [] | ||||
|     for path in paths: | ||||
|         file_path = project_dir / path | ||||
|         md5 = get_checksum(file_path) if file_path.exists() else None | ||||
|         data.append({"path": path, "md5": md5}) | ||||
|     return data | ||||
| 
 | ||||
| 
 | ||||
| def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: | ||||
|     """Checks whether requirements are installed and free of version conflicts. | ||||
|     requirements (List[str]): List of requirements. | ||||
|     RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts | ||||
|         exist. | ||||
|     """ | ||||
|     import pkg_resources | ||||
| 
 | ||||
|     failed_pkgs_msgs: List[str] = [] | ||||
|     conflicting_pkgs_msgs: List[str] = [] | ||||
| 
 | ||||
|     for req in requirements: | ||||
|         try: | ||||
|             pkg_resources.require(req) | ||||
|         except pkg_resources.DistributionNotFound as dnf: | ||||
|             failed_pkgs_msgs.append(dnf.report()) | ||||
|         except pkg_resources.VersionConflict as vc: | ||||
|             conflicting_pkgs_msgs.append(vc.report()) | ||||
|         except Exception: | ||||
|             msg.warn( | ||||
|                 f"Unable to check requirement: {req} " | ||||
|                 "Checks are currently limited to requirement specifiers " | ||||
|                 "(PEP 508)" | ||||
|             ) | ||||
| 
 | ||||
|     if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs): | ||||
|         msg.warn( | ||||
|             title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up " | ||||
|             "correctly and you installed all requirements specified in your project's requirements.txt: " | ||||
|         ) | ||||
|         for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs: | ||||
|             msg.text(pgk_msg) | ||||
| 
 | ||||
|     return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0 | ||||
| from weasel.cli.run import * | ||||
|  |  | |||
|  | @ -271,8 +271,9 @@ grad_factor = 1.0 | |||
| @layers = "reduce_mean.v1" | ||||
| 
 | ||||
| [components.textcat.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
|  | @ -308,8 +309,9 @@ grad_factor = 1.0 | |||
| @layers = "reduce_mean.v1" | ||||
| 
 | ||||
| [components.textcat_multilabel.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
|  | @ -542,14 +544,15 @@ nO = null | |||
| width = ${components.tok2vec.model.encode.width} | ||||
| 
 | ||||
| [components.textcat.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
| {% else -%} | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
|  | @ -570,15 +573,17 @@ nO = null | |||
| width = ${components.tok2vec.model.encode.width} | ||||
| 
 | ||||
| [components.textcat_multilabel.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
| {% else -%} | ||||
| [components.textcat_multilabel.model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| {%- endif %} | ||||
|  |  | |||
|  | @ -47,7 +47,8 @@ def train_cli( | |||
| 
 | ||||
|     DOCS: https://spacy.io/api/cli#train | ||||
|     """ | ||||
|     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) | ||||
|     if verbose: | ||||
|         util.logger.setLevel(logging.DEBUG) | ||||
|     overrides = parse_config_overrides(ctx.args) | ||||
|     import_code_paths(code_path) | ||||
|     train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) | ||||
|  |  | |||
|  | @ -26,6 +26,9 @@ batch_size = 1000 | |||
| [nlp.tokenizer] | ||||
| @tokenizers = "spacy.Tokenizer.v1" | ||||
| 
 | ||||
| [nlp.vectors] | ||||
| @vectors = "spacy.Vectors.v1" | ||||
| 
 | ||||
| # The pipeline components and their models | ||||
| [components] | ||||
| 
 | ||||
|  |  | |||
|  | @ -142,7 +142,25 @@ class SpanRenderer: | |||
|         spans (list): Individual entity spans and their start, end, label, kb_id and kb_url. | ||||
|         title (str / None): Document title set in Doc.user_data['title']. | ||||
|         """ | ||||
|         per_token_info = [] | ||||
|         per_token_info = self._assemble_per_token_info(tokens, spans) | ||||
|         markup = self._render_markup(per_token_info) | ||||
|         markup = TPL_SPANS.format(content=markup, dir=self.direction) | ||||
|         if title: | ||||
|             markup = TPL_TITLE.format(title=title) + markup | ||||
|         return markup | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _assemble_per_token_info( | ||||
|         tokens: List[str], spans: List[Dict[str, Any]] | ||||
|     ) -> List[Dict[str, List[Dict[str, Any]]]]: | ||||
|         """Assembles token info used to generate markup in render_spans(). | ||||
|         tokens (List[str]): Tokens in text. | ||||
|         spans (List[Dict[str, Any]]): Spans in text. | ||||
|         RETURNS (List[Dict[str, List[Dict[str, Any]]]]): Per token info needed to render HTML markup for given tokens | ||||
|             and spans. | ||||
|         """ | ||||
|         per_token_info: List[Dict[str, List[Dict[str, Any]]]] = [] | ||||
| 
 | ||||
|         # we must sort so that we can correctly describe when spans need to "stack" | ||||
|         # which is determined by their start token, then span length (longer spans on top), | ||||
|         # then break any remaining ties with the span label | ||||
|  | @ -154,21 +172,22 @@ class SpanRenderer: | |||
|                 s["label"], | ||||
|             ), | ||||
|         ) | ||||
| 
 | ||||
|         for s in spans: | ||||
|             # this is the vertical 'slot' that the span will be rendered in | ||||
|             # vertical_position = span_label_offset + (offset_step * (slot - 1)) | ||||
|             s["render_slot"] = 0 | ||||
| 
 | ||||
|         for idx, token in enumerate(tokens): | ||||
|             # Identify if a token belongs to a Span (and which) and if it's a | ||||
|             # start token of said Span. We'll use this for the final HTML render | ||||
|             token_markup: Dict[str, Any] = {} | ||||
|             token_markup["text"] = token | ||||
|             concurrent_spans = 0 | ||||
|             intersecting_spans: List[Dict[str, Any]] = [] | ||||
|             entities = [] | ||||
|             for span in spans: | ||||
|                 ent = {} | ||||
|                 if span["start_token"] <= idx < span["end_token"]: | ||||
|                     concurrent_spans += 1 | ||||
|                     span_start = idx == span["start_token"] | ||||
|                     ent["label"] = span["label"] | ||||
|                     ent["is_start"] = span_start | ||||
|  | @ -176,7 +195,12 @@ class SpanRenderer: | |||
|                         # When the span starts, we need to know how many other | ||||
|                         # spans are on the 'span stack' and will be rendered. | ||||
|                         # This value becomes the vertical render slot for this entire span | ||||
|                         span["render_slot"] = concurrent_spans | ||||
|                         span["render_slot"] = ( | ||||
|                             intersecting_spans[-1]["render_slot"] | ||||
|                             if len(intersecting_spans) | ||||
|                             else 0 | ||||
|                         ) + 1 | ||||
|                     intersecting_spans.append(span) | ||||
|                     ent["render_slot"] = span["render_slot"] | ||||
|                     kb_id = span.get("kb_id", "") | ||||
|                     kb_url = span.get("kb_url", "#") | ||||
|  | @ -193,11 +217,8 @@ class SpanRenderer: | |||
|                     span["render_slot"] = 0 | ||||
|             token_markup["entities"] = entities | ||||
|             per_token_info.append(token_markup) | ||||
|         markup = self._render_markup(per_token_info) | ||||
|         markup = TPL_SPANS.format(content=markup, dir=self.direction) | ||||
|         if title: | ||||
|             markup = TPL_TITLE.format(title=title) + markup | ||||
|         return markup | ||||
| 
 | ||||
|         return per_token_info | ||||
| 
 | ||||
|     def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str: | ||||
|         """Render the markup from per-token information""" | ||||
|  | @ -313,6 +334,8 @@ class DependencyRenderer: | |||
|                 self.lang = settings.get("lang", DEFAULT_LANG) | ||||
|             render_id = f"{id_prefix}-{i}" | ||||
|             svg = self.render_svg(render_id, p["words"], p["arcs"]) | ||||
|             if p.get("title"): | ||||
|                 svg = TPL_TITLE.format(title=p.get("title")) + svg | ||||
|             rendered.append(svg) | ||||
|         if page: | ||||
|             content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered]) | ||||
|  | @ -565,7 +588,7 @@ class EntityRenderer: | |||
|             for i, fragment in enumerate(fragments): | ||||
|                 markup += escape_html(fragment) | ||||
|                 if len(fragments) > 1 and i != len(fragments) - 1: | ||||
|                     markup += "</br>" | ||||
|                     markup += "<br>" | ||||
|             if self.ents is None or label.upper() in self.ents: | ||||
|                 color = self.colors.get(label.upper(), self.default_color) | ||||
|                 ent_settings = { | ||||
|  | @ -583,7 +606,7 @@ class EntityRenderer: | |||
|         for i, fragment in enumerate(fragments): | ||||
|             markup += escape_html(fragment) | ||||
|             if len(fragments) > 1 and i != len(fragments) - 1: | ||||
|                 markup += "</br>" | ||||
|                 markup += "<br>" | ||||
|         markup = TPL_ENTS.format(content=markup, dir=self.direction) | ||||
|         if title: | ||||
|             markup = TPL_TITLE.format(title=title) + markup | ||||
|  |  | |||
|  | @ -214,6 +214,7 @@ class Warnings(metaclass=ErrorsWithCodes): | |||
|     W125 = ("The StaticVectors key_attr is no longer used. To set a custom " | ||||
|             "key attribute for vectors, configure it through Vectors(attr=) or " | ||||
|             "'spacy init vectors --attr'") | ||||
|     W126 = ("These keys are unsupported: {unsupported}") | ||||
| 
 | ||||
|     # v4 warning strings | ||||
|     W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability " | ||||
|  | @ -226,7 +227,6 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|     E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). " | ||||
|             "This usually happens when spaCy calls `nlp.{method}` with a custom " | ||||
|             "component name that's not registered on the current language class. " | ||||
|             "If you're using a Transformer, make sure to install 'spacy-transformers'. " | ||||
|             "If you're using a custom component, make sure you've added the " | ||||
|             "decorator `@Language.component` (for function components) or " | ||||
|             "`@Language.factory` (for class components).\n\nAvailable " | ||||
|  | @ -551,12 +551,12 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|             "during training, make sure to include it in 'annotating components'") | ||||
| 
 | ||||
|     # New errors added in v3.x | ||||
|     E849 = ("The vocab only supports {method} for vectors of type " | ||||
|             "spacy.vectors.Vectors, not {vectors_type}.") | ||||
|     E850 = ("The PretrainVectors objective currently only supports default or " | ||||
|             "floret vectors, not {mode} vectors.") | ||||
|     E851 = ("The 'textcat' component labels should only have values of 0 or 1, " | ||||
|             "but found value of '{val}'.") | ||||
|     E852 = ("The tar file pulled from the remote attempted an unsafe path " | ||||
|             "traversal.") | ||||
|     E853 = ("Unsupported component factory name '{name}'. The character '.' is " | ||||
|             "not permitted in factory names.") | ||||
|     E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not " | ||||
|  | @ -970,6 +970,12 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|              " 'min_length': {min_length}, 'max_length': {max_length}") | ||||
|     E1054 = ("The text, including whitespace, must match between reference and " | ||||
|              "predicted docs when training {component}.") | ||||
|     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, " | ||||
|              "but only callbacks with one or three parameters are supported") | ||||
|     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.") | ||||
|     E1057 = ("The `TextCatReduce` architecture must be used with at least one " | ||||
|              "reduction. Please enable one of `use_reduce_first`, " | ||||
|              "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.") | ||||
| 
 | ||||
|     # v4 error strings | ||||
|     E4000 = ("Expected a Doc as input, but got: '{type}'") | ||||
|  |  | |||
|  | @ -2,4 +2,9 @@ from .candidate import Candidate, InMemoryCandidate | |||
| from .kb import KnowledgeBase | ||||
| from .kb_in_memory import InMemoryLookupKB | ||||
| 
 | ||||
| __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] | ||||
| __all__ = [ | ||||
|     "Candidate", | ||||
|     "KnowledgeBase", | ||||
|     "InMemoryCandidate", | ||||
|     "InMemoryLookupKB", | ||||
| ] | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| # cython: infer_types=True | ||||
| 
 | ||||
| from .kb_in_memory cimport InMemoryLookupKB | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| # cython: infer_types=True | ||||
| 
 | ||||
| from pathlib import Path | ||||
| from typing import Iterable, Tuple, Union | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| # cython: infer_types=True | ||||
| from typing import Any, Callable, Dict, Iterable | ||||
| 
 | ||||
| import srsly | ||||
|  |  | |||
|  | @ -6,7 +6,8 @@ _num_words = [ | |||
|     "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", | ||||
|     "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", | ||||
|     "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand", | ||||
|     "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion" | ||||
|     "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion", | ||||
|     "septillion", "octillion", "nonillion", "decillion", "gajillion", "bazillion" | ||||
| ] | ||||
| _ordinal_words = [ | ||||
|     "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", | ||||
|  | @ -14,7 +15,8 @@ _ordinal_words = [ | |||
|     "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth", | ||||
|     "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth", | ||||
|     "eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth", | ||||
|     "trillionth", "quadrillionth", "gajillionth", "bazillionth" | ||||
|     "trillionth", "quadrillionth", "quintillionth", "sextillionth", "septillionth", | ||||
|     "octillionth", "nonillionth", "decillionth", "gajillionth", "bazillionth" | ||||
| ] | ||||
| # fmt: on | ||||
| 
 | ||||
|  |  | |||
|  | @ -163,7 +163,7 @@ class SpanishLemmatizer(Lemmatizer): | |||
|         for old, new in self.lookups.get_table("lemma_rules").get("det", []): | ||||
|             if word == old: | ||||
|                 return [new] | ||||
|         # If none of the specfic rules apply, search in the common rules for | ||||
|         # If none of the specific rules apply, search in the common rules for | ||||
|         # determiners and pronouns that follow a unique pattern for | ||||
|         # lemmatization. If the word is in the list, return the corresponding | ||||
|         # lemma. | ||||
|  | @ -291,7 +291,7 @@ class SpanishLemmatizer(Lemmatizer): | |||
|         for old, new in self.lookups.get_table("lemma_rules").get("pron", []): | ||||
|             if word == old: | ||||
|                 return [new] | ||||
|         # If none of the specfic rules apply, search in the common rules for | ||||
|         # If none of the specific rules apply, search in the common rules for | ||||
|         # determiners and pronouns that follow a unique pattern for | ||||
|         # lemmatization. If the word is in the list, return the corresponding | ||||
|         # lemma. | ||||
|  |  | |||
							
								
								
									
										18
									
								
								spacy/lang/fo/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								spacy/lang/fo/__init__.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,18 @@ | |||
| from ...language import BaseDefaults, Language | ||||
| from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| 
 | ||||
| 
 | ||||
| class FaroeseDefaults(BaseDefaults): | ||||
|     tokenizer_exceptions = TOKENIZER_EXCEPTIONS | ||||
|     infixes = TOKENIZER_INFIXES | ||||
|     suffixes = TOKENIZER_SUFFIXES | ||||
|     prefixes = TOKENIZER_PREFIXES | ||||
| 
 | ||||
| 
 | ||||
| class Faroese(Language): | ||||
|     lang = "fo" | ||||
|     Defaults = FaroeseDefaults | ||||
| 
 | ||||
| 
 | ||||
| __all__ = ["Faroese"] | ||||
							
								
								
									
										90
									
								
								spacy/lang/fo/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										90
									
								
								spacy/lang/fo/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,90 @@ | |||
| from ...symbols import ORTH | ||||
| from ...util import update_exc | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| 
 | ||||
_exc = {}

# Faroese abbreviations. Each one becomes a tokenizer exception that maps the
# whole string to a single token whose ORTH is that same string.
_ABBREVIATIONS = (
    "apr.",
    "aug.",
    "avgr.",
    "árg.",
    "ávís.",
    "beinl.",
    "blkv.",
    "blaðkv.",
    "blm.",
    "blaðm.",
    "bls.",
    "blstj.",
    "blaðstj.",
    "des.",
    "eint.",
    "febr.",
    "fyrrv.",
    "góðk.",
    "h.m.",
    "innt.",
    "jan.",
    "kl.",
    "m.a.",
    "mðr.",
    "mió.",
    "nr.",
    "nto.",
    "nov.",
    "nút.",
    "o.a.",
    "o.a.m.",
    "o.a.tíl.",
    "o.fl.",
    "ff.",
    "o.m.a.",
    "o.o.",
    "o.s.fr.",
    "o.tíl.",
    "o.ø.",
    "okt.",
    "omf.",
    "pst.",
    "ritstj.",
    "sbr.",
    "sms.",
    "smst.",
    "smb.",
    "sb.",
    "sbrt.",
    "sp.",
    "sept.",
    "spf.",
    "spsk.",
    "t.e.",
    "t.s.",
    "t.s.s.",
    "tlf.",
    "tel.",
    "tsk.",
    "t.o.v.",
    "t.d.",
    "uml.",
    "ums.",
    "uppl.",
    "upprfr.",
    "uppr.",
    "útg.",
    "útl.",
    "útr.",
    "vanl.",
    "v.",
    "v.h.",
    "v.ø.o.",
    "viðm.",
    "viðv.",
    "vm.",
    "v.m.",
)

# Register every abbreviation both as listed and in its str.capitalize() form
# (first character upper-cased, remainder lower-cased).
for _abbr in _ABBREVIATIONS:
    for _form in (_abbr, _abbr.capitalize()):
        _exc[_form] = [{ORTH: _form}]

# Layer the Faroese entries on top of the shared base exceptions.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|  | @ -15,6 +15,7 @@ _prefixes = ( | |||
|     [ | ||||
|         "†", | ||||
|         "⸏", | ||||
|         "〈", | ||||
|     ] | ||||
|     + LIST_PUNCT | ||||
|     + LIST_ELLIPSES | ||||
|  | @ -31,6 +32,7 @@ _suffixes = ( | |||
|     + [ | ||||
|         "†", | ||||
|         "⸎", | ||||
|         "〉", | ||||
|         r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]", | ||||
|     ] | ||||
| ) | ||||
|  |  | |||
							
								
								
									
										20
									
								
								spacy/lang/nn/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								spacy/lang/nn/__init__.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,20 @@ | |||
| from ...language import BaseDefaults, Language | ||||
| from ..nb import SYNTAX_ITERATORS | ||||
| from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES | ||||
| from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||
| 
 | ||||
| 
 | ||||
class NorwegianNynorskDefaults(BaseDefaults):
    """Tokenizer and syntax defaults for Norwegian Nynorsk.

    The punctuation rules and tokenizer exceptions come from this package's
    own modules; the syntax iterators are imported from the sibling `nb`
    package.
    """

    syntax_iterators = SYNTAX_ITERATORS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    suffixes = TOKENIZER_SUFFIXES
    infixes = TOKENIZER_INFIXES
    prefixes = TOKENIZER_PREFIXES
| 
 | ||||
| 
 | ||||
class NorwegianNynorsk(Language):
    """Language subclass for Norwegian Nynorsk (language code "nn")."""

    Defaults = NorwegianNynorskDefaults
    lang = "nn"
| 
 | ||||
| 
 | ||||
| __all__ = ["NorwegianNynorsk"] | ||||
							
								
								
									
										15
									
								
								spacy/lang/nn/examples.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								spacy/lang/nn/examples.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,15 @@ | |||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.nn.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| # sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) | ||||
| sentences = [ | ||||
|     "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.", | ||||
|     "Det er ein meir enn i same periode i fjor.", | ||||
|     "Det har lava ned enorme snømengder i store delar av Europa den siste tida.", | ||||
|     "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.", | ||||
| ] | ||||
							
								
								
									
										74
									
								
								spacy/lang/nn/punctuation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										74
									
								
								spacy/lang/nn/punctuation.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,74 @@ | |||
| from ..char_classes import ( | ||||
|     ALPHA, | ||||
|     ALPHA_LOWER, | ||||
|     ALPHA_UPPER, | ||||
|     CONCAT_QUOTES, | ||||
|     CURRENCY, | ||||
|     LIST_CURRENCY, | ||||
|     LIST_ELLIPSES, | ||||
|     LIST_ICONS, | ||||
|     LIST_PUNCT, | ||||
|     LIST_QUOTES, | ||||
|     PUNCT, | ||||
|     UNITS, | ||||
| ) | ||||
| from ..punctuation import TOKENIZER_SUFFIXES | ||||
| 
 | ||||
# Local variants of the shared character-class lists, each with selected
# characters filtered out before the rules below are assembled.

# Concatenated quote characters without the ASCII apostrophe.
_quotes = CONCAT_QUOTES.replace("'", "")
# Punctuation entries without "#".
_list_punct = [punct for punct in LIST_PUNCT if punct != "#"]
# Quote entries without the escaped apostrophe pattern.
_list_quotes = [quote for quote in LIST_QUOTES if quote != "\\'"]
# Icon entries: drop a bare degree sign and strip the literal text \u00B0
# from every remaining entry.
_list_icons = [icon.replace("\\u00B0", "") for icon in LIST_ICONS if icon != "°"]
| 
 | ||||
| 
 | ||||
# Prefix rules: a few literal symbols, a "+" that is not followed by a digit,
# then the punctuation list without "#" and the shared ellipsis, quote,
# currency and icon lists.
_prefixes = [
    "§",
    "%",
    "=",
    "—",
    "–",
    r"\+(?![0-9])",
    *_list_punct,
    *LIST_ELLIPSES,
    *LIST_QUOTES,
    *LIST_CURRENCY,
    *LIST_ICONS,
]
| 
 | ||||
| 
 | ||||
# Infix rules: ellipses and icons, followed by patterns that only fire when the
# matched character sits between two alphabetic characters (or, for the first
# pattern, between a lower-case and an upper-case letter).
#
# The original list also contained a separate ","-between-letters pattern. It
# was removed because the "[,!?]" pattern below has the same lookbehind and
# lookahead and already matches the comma, so the extra alternative could
# never contribute a match.
_infixes = (
    LIST_ELLIPSES
    + _list_icons
    + [
        # "." between a lower-case and an upper-case letter
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        # ",", "!" or "?" between two letters
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        # ":", "<", ">", "=" or "/" between two letters
        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
        # a quote character or a round/square bracket between two letters
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
        # a double hyphen between two letters
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
    ]
)
| 
 | ||||
# Suffix rules: shared punctuation/ellipsis lists, the filtered quote and icon
# lists, dashes, and a set of lookbehind-guarded patterns.
_suffixes = [
    *LIST_PUNCT,
    *LIST_ELLIPSES,
    *_list_quotes,
    *_list_icons,
    "—",
    "–",
    r"(?<=[0-9])\+",
    r"(?<=°[FfCcKk])\.",
    r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
    r"(?<=[0-9])(?:{u})".format(u=UNITS),
    r"(?<=[{al}{e}{p}(?:{q})])\.".format(
        al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
    ),
    r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    # apostrophe, unless it follows s, x or z (either case)
    r"(?<=[^sSxXzZ])'",
]

# Append the shared suffix rules, leaving out the possessive-"s" and bare
# apostrophe entries. This must run before TOKENIZER_SUFFIXES is rebound below.
_excluded_shared_suffixes = ("'s", "'S", "’s", "’S", r"\'")
_suffixes.extend(
    shared for shared in TOKENIZER_SUFFIXES if shared not in _excluded_shared_suffixes
)


# Names exported to the language defaults.
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
							
								
								
									
										228
									
								
								spacy/lang/nn/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										228
									
								
								spacy/lang/nn/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,228 @@ | |||
| from ...symbols import NORM, ORTH | ||||
| from ...util import update_exc | ||||
| from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||
| 
 | ||||
_exc = {}


# Abbreviated month names. Each is kept as a single token whose NORM is the
# full month name.
# note: "jul." is in the simple list below without a NORM exception
for _abbr, _month in (
    ("jan.", "januar"),
    ("feb.", "februar"),
    ("mar.", "mars"),
    ("apr.", "april"),
    ("jun.", "juni"),
    ("aug.", "august"),
    ("sep.", "september"),
    ("okt.", "oktober"),
    ("nov.", "november"),
    ("des.", "desember"),
):
    _exc[_abbr] = [{ORTH: _abbr, NORM: _month}]
| 
 | ||||
| 
 | ||||
# Simple abbreviations: each string is kept as a single token with no NORM.
#
# Some strings here (currently "sep.") already have a richer entry with a NORM
# registered earlier in this module. `setdefault` keeps that earlier entry
# instead of overwriting it, so the NORM mapping is not silently lost.
for orth in [
    "Ap.",
    "Aq.",
    "Ca.",
    "Chr.",
    "Co.",
    "Dr.",
    "F.eks.",
    "Fr.p.",
    "Frp.",
    "Grl.",
    "Kr.",
    "Kr.F.",
    "Kr.F.s",
    "Mr.",
    "Mrs.",
    "Pb.",
    "Pr.",
    "Sp.",
    "St.",
    "a.m.",
    "ad.",
    "adm.dir.",
    "adr.",
    "b.c.",
    "bl.a.",
    "bla.",
    "bm.",
    "bnr.",
    "bto.",
    "c.c.",
    "ca.",
    "cand.mag.",
    "co.",
    "d.d.",
    "d.m.",
    "d.y.",
    "dept.",
    "dr.",
    "dr.med.",
    "dr.philos.",
    "dr.psychol.",
    "dss.",
    "dvs.",
    "e.Kr.",
    "e.l.",
    "eg.",
    "eig.",
    "ekskl.",
    "el.",
    "et.",
    "etc.",
    "etg.",
    "ev.",
    "evt.",
    "f.",
    "f.Kr.",
    "f.eks.",
    "f.o.m.",
    "fhv.",
    "fk.",
    "foreg.",
    "fork.",
    "fv.",
    "fvt.",
    "g.",
    "gl.",
    "gno.",
    "gnr.",
    "grl.",
    "gt.",
    "h.r.adv.",
    "hhv.",
    "hoh.",
    "hr.",
    "ifb.",
    "ifm.",
    "iht.",
    "inkl.",
    "istf.",
    "jf.",
    "jr.",
    "jul.",
    "juris.",
    "kfr.",
    "kgl.",
    "kgl.res.",
    "kl.",
    "komm.",
    "kr.",
    "kst.",
    "lat.",
    "lø.",
    "m.a.",
    "m.a.o.",
    "m.fl.",
    "m.m.",
    "m.v.",
    "ma.",
    "mag.art.",
    "md.",
    "mfl.",
    "mht.",
    "mill.",
    "min.",
    "mnd.",
    "moh.",
    "mrd.",
    "muh.",
    "mv.",
    "mva.",
    "n.å.",
    "ndf.",
    "nr.",
    "nto.",
    "nyno.",
    "o.a.",
    "o.l.",
    "obl.",
    "off.",
    "ofl.",
    "on.",
    "op.",
    "org.",
    "osv.",
    "ovf.",
    "p.",
    "p.a.",
    "p.g.a.",
    "p.m.",
    "p.t.",
    "pga.",
    "ph.d.",
    "pkt.",
    "pr.",
    "pst.",
    "pt.",
    "red.anm.",
    "ref.",
    "res.",
    "res.kap.",
    "resp.",
    "rv.",
    "s.",
    "s.d.",
    "s.k.",
    "s.u.",
    "s.å.",
    "sen.",
    "sep.",
    "siviling.",
    "sms.",
    "snr.",
    "spm.",
    "sr.",
    "sst.",
    "st.",
    "st.meld.",
    "st.prp.",
    "stip.",
    "stk.",
    "stud.",
    "sv.",
    "såk.",
    "sø.",
    "t.d.",
    "t.h.",
    "t.o.m.",
    "t.v.",
    "temp.",
    "ti.",
    "tils.",
    "tilsv.",
    "tl;dr",
    "tlf.",
    "to.",
    "ult.",
    "utg.",
    "v.",
    "vedk.",
    "vedr.",
    "vg.",
    "vgs.",
    "vha.",
    "vit.ass.",
    "vn.",
    "vol.",
    "vs.",
    "vsa.",
    "§§",
    "©NTB",
    "årg.",
    "årh.",
]:
    # Do not clobber an entry that was already registered with extra attributes.
    _exc.setdefault(orth, [{ORTH: orth}])
| 
 | ||||
# Dates
# The numbers 1 through 31 followed by a period ("1.", ..., "31.") are each
# kept as a single token.
for _day in range(1, 32):
    _ordinal = f"{_day}."
    _exc[_ordinal] = [{ORTH: _ordinal}]

# "i." is split into the token "i" followed by a separate "." token.
_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
_exc.update(_custom_base_exc)

# Layer the entries collected above on top of the shared base exceptions.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|  | @ -15,4 +15,7 @@ sentences = [ | |||
|     "Türkiye'nin başkenti neresi?", | ||||
|     "Bakanlar Kurulu 180 günlük eylem planını açıkladı.", | ||||
|     "Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.", | ||||
|     "Cemal Sureya kimdir?", | ||||
|     "Bunlari Biliyor muydunuz?", | ||||
|     "Altinoluk Turkiye haritasinin neresinde yer alir?", | ||||
| ] | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| import functools | ||||
| import inspect | ||||
| import itertools | ||||
| import multiprocessing as mp | ||||
| import random | ||||
|  | @ -64,6 +65,7 @@ from .util import ( | |||
|     registry, | ||||
|     warn_if_jupyter_cupy, | ||||
| ) | ||||
| from .vectors import BaseVectors | ||||
| from .vocab import Vocab, create_vocab | ||||
| 
 | ||||
| PipeCallable = Callable[[Doc], Doc] | ||||
|  | @ -153,6 +155,7 @@ class Language: | |||
|         max_length: int = 10**6, | ||||
|         meta: Dict[str, Any] = {}, | ||||
|         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, | ||||
|         create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None, | ||||
|         batch_size: int = 1000, | ||||
|         **kwargs, | ||||
|     ) -> None: | ||||
|  | @ -192,6 +195,10 @@ class Language: | |||
|             raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) | ||||
|         if vocab is True: | ||||
|             vocab = create_vocab(self.lang, self.Defaults) | ||||
|             if not create_vectors: | ||||
|                 vectors_cfg = {"vectors": self._config["nlp"]["vectors"]} | ||||
|                 create_vectors = registry.resolve(vectors_cfg)["vectors"] | ||||
|             vocab.vectors = create_vectors(vocab) | ||||
|         else: | ||||
|             if (self.lang and vocab.lang) and (self.lang != vocab.lang): | ||||
|                 raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) | ||||
|  | @ -1878,6 +1885,10 @@ class Language: | |||
|             ).merge(config) | ||||
|         if "nlp" not in config: | ||||
|             raise ValueError(Errors.E985.format(config=config)) | ||||
|         # fill in [nlp.vectors] if not present (as a narrower alternative to | ||||
|         # auto-filling [nlp] from the default config) | ||||
|         if "vectors" not in config["nlp"]: | ||||
|             config["nlp"]["vectors"] = {"@vectors": "spacy.Vectors.v1"} | ||||
|         config_lang = config["nlp"].get("lang") | ||||
|         if config_lang is not None and config_lang != cls.lang: | ||||
|             raise ValueError( | ||||
|  | @ -1913,6 +1924,7 @@ class Language: | |||
|             filled["nlp"], validate=validate, schema=ConfigSchemaNlp | ||||
|         ) | ||||
|         create_tokenizer = resolved_nlp["tokenizer"] | ||||
|         create_vectors = resolved_nlp["vectors"] | ||||
|         before_creation = resolved_nlp["before_creation"] | ||||
|         after_creation = resolved_nlp["after_creation"] | ||||
|         after_pipeline_creation = resolved_nlp["after_pipeline_creation"] | ||||
|  | @ -1933,7 +1945,12 @@ class Language: | |||
|         # inside stuff like the spacy train function. If we loaded them here, | ||||
|         # then we would load them twice at runtime: once when we make from config, | ||||
|         # and then again when we load from disk. | ||||
|         nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta) | ||||
|         nlp = lang_cls( | ||||
|             vocab=vocab, | ||||
|             create_tokenizer=create_tokenizer, | ||||
|             create_vectors=create_vectors, | ||||
|             meta=meta, | ||||
|         ) | ||||
|         if after_creation is not None: | ||||
|             nlp = after_creation(nlp) | ||||
|             if not isinstance(nlp, cls): | ||||
|  | @ -2150,8 +2167,20 @@ class Language: | |||
|             # Go over the listener layers and replace them | ||||
|             for listener in pipe_listeners: | ||||
|                 new_model = tok2vec_model.copy() | ||||
|                 if "replace_listener" in tok2vec_model.attrs: | ||||
|                     new_model = tok2vec_model.attrs["replace_listener"](new_model) | ||||
|                 replace_listener_func = tok2vec_model.attrs.get("replace_listener") | ||||
|                 if replace_listener_func is not None: | ||||
|                     # Pass the extra args to the callback without breaking compatibility with | ||||
|                     # old library versions that only expect a single parameter. | ||||
|                     num_params = len( | ||||
|                         inspect.signature(replace_listener_func).parameters | ||||
|                     ) | ||||
|                     if num_params == 1: | ||||
|                         new_model = replace_listener_func(new_model) | ||||
|                     elif num_params == 3: | ||||
|                         new_model = replace_listener_func(new_model, listener, tok2vec) | ||||
|                     else: | ||||
|                         raise ValueError(Errors.E1055.format(num_params=num_params)) | ||||
| 
 | ||||
|                 util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined] | ||||
|                 tok2vec.remove_listener(listener, pipe_name) | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| # cython: embedsignature=True | ||||
| # cython: profile=False | ||||
| # Compiler crashes on memory view coercion without this. Should report bug. | ||||
| cimport numpy as np | ||||
| from libc.string cimport memset | ||||
|  |  | |||
|  | @ -3,4 +3,4 @@ from .levenshtein import levenshtein | |||
| from .matcher import Matcher | ||||
| from .phrasematcher import PhraseMatcher | ||||
| 
 | ||||
| __all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"] | ||||
| __all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"] | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| # cython: infer_types=True | ||||
| import warnings | ||||
| from collections import defaultdict | ||||
| from itertools import product | ||||
|  | @ -129,6 +129,7 @@ cdef class DependencyMatcher: | |||
|             else: | ||||
|                 required_keys = {"RIGHT_ID", "RIGHT_ATTRS", "REL_OP", "LEFT_ID"} | ||||
|                 relation_keys = set(relation.keys()) | ||||
|                 # Identify required keys that have not been specified | ||||
|                 missing = required_keys - relation_keys | ||||
|                 if missing: | ||||
|                     missing_txt = ", ".join(list(missing)) | ||||
|  | @ -136,6 +137,13 @@ cdef class DependencyMatcher: | |||
|                         required=required_keys, | ||||
|                         missing=missing_txt | ||||
|                     )) | ||||
|                 # Identify additional, unsupported keys | ||||
|                 unsupported = relation_keys - required_keys | ||||
|                 if unsupported: | ||||
|                     unsupported_txt = ", ".join(list(unsupported)) | ||||
|                     warnings.warn(Warnings.W126.format( | ||||
|                         unsupported=unsupported_txt | ||||
|                     )) | ||||
|                 if ( | ||||
|                     relation["RIGHT_ID"] in visited_nodes | ||||
|                     or relation["LEFT_ID"] not in visited_nodes | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: profile=True, binding=True, infer_types=True | ||||
| # cython: binding=True, infer_types=True | ||||
| from cpython.object cimport PyObject | ||||
| from libc.stdint cimport int64_t | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: binding=True, infer_types=True, profile=True | ||||
| # cython: binding=True, infer_types=True | ||||
| from typing import Iterable, List | ||||
| 
 | ||||
| from cymem.cymem cimport Pool | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| # cython: infer_types=True | ||||
| from collections import defaultdict | ||||
| from typing import List | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,21 +1,27 @@ | |||
| from functools import partial | ||||
| from typing import List, Optional, cast | ||||
| from typing import List, Optional, Tuple, cast | ||||
| 
 | ||||
| from thinc.api import ( | ||||
|     Dropout, | ||||
|     Gelu, | ||||
|     LayerNorm, | ||||
|     Linear, | ||||
|     Logistic, | ||||
|     Maxout, | ||||
|     Model, | ||||
|     ParametricAttention, | ||||
|     ParametricAttention_v2, | ||||
|     Relu, | ||||
|     Softmax, | ||||
|     SparseLinear, | ||||
|     SparseLinear_v2, | ||||
|     chain, | ||||
|     clone, | ||||
|     concatenate, | ||||
|     list2ragged, | ||||
|     reduce_first, | ||||
|     reduce_last, | ||||
|     reduce_max, | ||||
|     reduce_mean, | ||||
|     reduce_sum, | ||||
|     residual, | ||||
|  | @ -25,9 +31,10 @@ from thinc.api import ( | |||
| ) | ||||
| from thinc.layers.chain import init as init_chain | ||||
| from thinc.layers.resizable import resize_linear_weighted, resize_model | ||||
| from thinc.types import Floats2d | ||||
| from thinc.types import ArrayXd, Floats2d | ||||
| 
 | ||||
| from ...attrs import ORTH | ||||
| from ...errors import Errors | ||||
| from ...tokens import Doc | ||||
| from ...util import registry | ||||
| from ..extract_ngrams import extract_ngrams | ||||
|  | @ -47,39 +54,15 @@ def build_simple_cnn_text_classifier( | |||
|     outputs sum to 1. If exclusive_classes=False, a logistic non-linearity | ||||
|     is applied instead, so that outputs are in the range [0, 1]. | ||||
|     """ | ||||
|     fill_defaults = {"b": 0, "W": 0} | ||||
|     with Model.define_operators({">>": chain}): | ||||
|         cnn = tok2vec >> list2ragged() >> reduce_mean() | ||||
|         nI = tok2vec.maybe_get_dim("nO") | ||||
|         if exclusive_classes: | ||||
|             output_layer = Softmax(nO=nO, nI=nI) | ||||
|             fill_defaults["b"] = NEG_VALUE | ||||
|             resizable_layer: Model = resizable( | ||||
|                 output_layer, | ||||
|                 resize_layer=partial( | ||||
|                     resize_linear_weighted, fill_defaults=fill_defaults | ||||
|                 ), | ||||
|             ) | ||||
|             model = cnn >> resizable_layer | ||||
|         else: | ||||
|             output_layer = Linear(nO=nO, nI=nI) | ||||
|             resizable_layer = resizable( | ||||
|                 output_layer, | ||||
|                 resize_layer=partial( | ||||
|                     resize_linear_weighted, fill_defaults=fill_defaults | ||||
|                 ), | ||||
|             ) | ||||
|             model = cnn >> resizable_layer >> Logistic() | ||||
|         model.set_ref("output_layer", output_layer) | ||||
|         model.attrs["resize_output"] = partial( | ||||
|             resize_and_set_ref, | ||||
|             resizable_layer=resizable_layer, | ||||
|         ) | ||||
|     model.set_ref("tok2vec", tok2vec) | ||||
|     if nO is not None: | ||||
|         model.set_dim("nO", cast(int, nO)) | ||||
|     model.attrs["multi_label"] = not exclusive_classes | ||||
|     return model | ||||
|     return build_reduce_text_classifier( | ||||
|         tok2vec=tok2vec, | ||||
|         exclusive_classes=exclusive_classes, | ||||
|         use_reduce_first=False, | ||||
|         use_reduce_last=False, | ||||
|         use_reduce_max=False, | ||||
|         use_reduce_mean=True, | ||||
|         nO=nO, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def resize_and_set_ref(model, new_nO, resizable_layer): | ||||
|  | @ -95,10 +78,48 @@ def build_bow_text_classifier( | |||
|     ngram_size: int, | ||||
|     no_output_layer: bool, | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     return _build_bow_text_classifier( | ||||
|         exclusive_classes=exclusive_classes, | ||||
|         ngram_size=ngram_size, | ||||
|         no_output_layer=no_output_layer, | ||||
|         nO=nO, | ||||
|         sparse_linear=SparseLinear(nO=nO), | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
@registry.architectures("spacy.TextCatBOW.v3")
def build_bow_text_classifier_v3(
    exclusive_classes: bool,
    ngram_size: int,
    no_output_layer: bool,
    length: int = 262144,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """Build the bag-of-words text classifier with a configurable sparse layer.

    `length` is validated, rounded up to a power of two, and used to construct
    a `SparseLinear_v2` layer that is handed to `_build_bow_text_classifier`
    together with the remaining arguments.

    exclusive_classes (bool): Passed through to the shared builder.
    ngram_size (int): Passed through to the shared builder.
    no_output_layer (bool): Passed through to the shared builder.
    length (int): Requested length of the sparse linear layer. Values that are
        not a power of two are rounded up to the next one.
    nO (Optional[int]): Output dimension given to the sparse linear layer and
        to the shared builder.
    RAISES (ValueError): If `length` is smaller than 1.
    """
    if length < 1:
        raise ValueError(Errors.E1056.format(length=length))

    # Smallest power of two that is >= length, i.e. 2**k with
    # 2**(k-1) < length <= 2**k.
    rounded_length = 1 << (length - 1).bit_length()
    sparse_layer = SparseLinear_v2(nO=nO, length=rounded_length)

    return _build_bow_text_classifier(
        exclusive_classes=exclusive_classes,
        ngram_size=ngram_size,
        no_output_layer=no_output_layer,
        nO=nO,
        sparse_linear=sparse_layer,
    )
| 
 | ||||
| 
 | ||||
| def _build_bow_text_classifier( | ||||
|     exclusive_classes: bool, | ||||
|     ngram_size: int, | ||||
|     no_output_layer: bool, | ||||
|     sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd], | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     fill_defaults = {"b": 0, "W": 0} | ||||
|     with Model.define_operators({">>": chain}): | ||||
|         sparse_linear = SparseLinear(nO=nO) | ||||
|         output_layer = None | ||||
|         if not no_output_layer: | ||||
|             fill_defaults["b"] = NEG_VALUE | ||||
|  | @ -127,6 +148,9 @@ def build_text_classifier_v2( | |||
|     linear_model: Model[List[Doc], Floats2d], | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     # TODO: build the model with _build_parametric_attention_with_residual_nonlinear | ||||
|     # in spaCy v4. We don't do this in spaCy v3 to preserve model | ||||
|     # compatibility. | ||||
|     exclusive_classes = not linear_model.attrs["multi_label"] | ||||
|     with Model.define_operators({">>": chain, "|": concatenate}): | ||||
|         width = tok2vec.maybe_get_dim("nO") | ||||
|  | @ -190,3 +214,145 @@ def build_text_classifier_lowdata( | |||
|             model = model >> Dropout(dropout) | ||||
|         model = model >> Logistic() | ||||
|     return model | ||||
| 
 | ||||
| 
 | ||||
@registry.architectures("spacy.TextCatParametricAttention.v1")
def build_textcat_parametric_attention_v1(
    tok2vec: Model[List[Doc], List[Floats2d]],
    exclusive_classes: bool,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """Build a text classifier on top of a parametric-attention pooling block.

    The pooling block is created by
    `_build_parametric_attention_with_residual_nonlinear` with a `Maxout`
    non-linearity and a `Gelu` key transform, both sized to the tok2vec
    output width. Its output feeds a `Softmax` layer when classes are
    exclusive, otherwise a `Linear` layer followed by `Logistic`.

    tok2vec (Model): The token-to-vector layer to pool over.
    exclusive_classes (bool): Selects the Softmax output (True) or the
        Linear + Logistic output (False).
    nO (Optional[int]): Output dimension of the output layer.
    RETURNS (Model): The assembled model, with the "output_layer" ref and the
        "multi_label" attribute set.
    """
    hidden_width = tok2vec.maybe_get_dim("nO")
    pooling = _build_parametric_attention_with_residual_nonlinear(
        tok2vec=tok2vec,
        nonlinear_layer=Maxout(nI=hidden_width, nO=hidden_width),
        key_transform=Gelu(nI=hidden_width, nO=hidden_width),
    )

    with Model.define_operators({">>": chain}):
        output_layer = (
            Softmax(nO=nO) if exclusive_classes else (Linear(nO=nO) >> Logistic())
        )
        model = pooling >> output_layer

    # Only pin the output dimension when one was supplied and the model does
    # not report the dimension as unsupported.
    if nO is not None and model.has_dim("nO") is not False:
        model.set_dim("nO", cast(int, nO))
    model.set_ref("output_layer", output_layer)
    model.attrs["multi_label"] = not exclusive_classes

    return model
| 
 | ||||
| 
 | ||||
def _build_parametric_attention_with_residual_nonlinear(
    *,
    tok2vec: Model[List[Doc], List[Floats2d]],
    nonlinear_layer: Model[Floats2d, Floats2d],
    key_transform: Optional[Model[Floats2d, Floats2d]] = None,
) -> Model[List[Doc], Floats2d]:
    """Assemble the attention-pooling block used by the text classifiers.

    The block chains: tok2vec, `list2ragged`, `ParametricAttention_v2`,
    `reduce_sum`, and a residual wrapper around
    `nonlinear_layer >> LayerNorm >> Dropout(0.0)`.

    tok2vec (Model): The token-to-vector layer.
    nonlinear_layer (Model): Non-linear layer placed inside the residual block.
    key_transform (Optional[Model]): Passed to `ParametricAttention_v2`.
    RETURNS (Model): The chained model, with a custom init function and refs
        named "tok2vec", "attention_layer", "nonlinear_layer" and
        "norm_layer".
    """
    hidden_width = tok2vec.maybe_get_dim("nO")
    attention_layer = ParametricAttention_v2(
        nO=hidden_width, key_transform=key_transform
    )
    norm_layer = LayerNorm(nI=hidden_width)

    with Model.define_operators({">>": chain}):
        pooled = tok2vec >> list2ragged() >> attention_layer >> reduce_sum()
        residual_block = residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
        parametric_attention = pooled >> residual_block

    # Replace the default init with one that sets the layer dimensions first.
    parametric_attention.init = _init_parametric_attention_with_residual_nonlinear

    for ref_name, layer in (
        ("tok2vec", tok2vec),
        ("attention_layer", attention_layer),
        ("nonlinear_layer", nonlinear_layer),
        ("norm_layer", norm_layer),
    ):
        parametric_attention.set_ref(ref_name, layer)

    return parametric_attention
| 
 | ||||
| 
 | ||||
def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
    """Set the referenced layers' dimensions, then run the chain initializer.

    The width returned by `get_tok2vec_width(model)` is written to the
    "attention_layer", "nonlinear_layer" and "norm_layer" refs before
    `init_chain` is called with the same `X` and `Y`.

    model (Model): The model built by
        `_build_parametric_attention_with_residual_nonlinear`.
    X: Sample input forwarded to `init_chain`.
    Y: Sample output forwarded to `init_chain`.
    RETURNS (Model): The same model instance.
    """
    width = get_tok2vec_width(model)
    # (ref name, dimensions to set), in the order they are applied.
    dims_by_ref = (
        ("attention_layer", ("nO",)),
        ("nonlinear_layer", ("nO", "nI")),
        ("norm_layer", ("nI", "nO")),
    )
    for ref_name, dim_names in dims_by_ref:
        layer = model.get_ref(ref_name)
        for dim_name in dim_names:
            layer.set_dim(dim_name, width)
    init_chain(model, X, Y)
    return model
| 
 | ||||
| 
 | ||||
@registry.architectures("spacy.TextCatReduce.v1")
def build_reduce_text_classifier(
    tok2vec: Model,
    exclusive_classes: bool,
    use_reduce_first: bool,
    use_reduce_last: bool,
    use_reduce_max: bool,
    use_reduce_mean: bool,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """Build a text classifier on top of pooled `Doc` representations.

    The token representations produced by `tok2vec` are pooled with one or
    more reductions. When several reductions are enabled, their outputs are
    concatenated (in the order first, last, max, mean) before classification.

    tok2vec (Model): the tok2vec layer to pool over.
    exclusive_classes (bool): Whether or not classes are mutually exclusive.
        Selects a Softmax output layer when true, otherwise a Linear layer
        followed by Logistic.
    use_reduce_first (bool): Pool by using the hidden representation of the
        first token of a `Doc`.
    use_reduce_last (bool): Pool by using the hidden representation of the
        last token of a `Doc`.
    use_reduce_max (bool): Pool by taking the maximum values of the hidden
        representations of a `Doc`.
    use_reduce_mean (bool): Pool by taking the mean of all hidden
        representations of a `Doc`.
    nO (Optional[int]): Number of classes.
    RETURNS (Model[List[Doc], Floats2d]): the assembled classifier.
    RAISES (ValueError): if none of the `use_reduce_*` flags is enabled.
    """
    # Pair every flag with its layer factory so only enabled reductions are
    # instantiated, in a fixed order.
    reduction_switches = (
        (use_reduce_first, reduce_first),
        (use_reduce_last, reduce_last),
        (use_reduce_max, reduce_max),
        (use_reduce_mean, reduce_mean),
    )
    reductions = [make() for enabled, make in reduction_switches if enabled]
    if len(reductions) == 0:
        raise ValueError(Errors.E1057)

    # Fill values used when the output layer is resized; the bias fill
    # differs between the exclusive and the multi-label setup.
    fill_defaults = {"b": NEG_VALUE if exclusive_classes else 0, "W": 0}

    with Model.define_operators({">>": chain}):
        pooled = tok2vec >> list2ragged() >> concatenate(*reductions)
        tok2vec_width = tok2vec.maybe_get_dim("nO")
        # The classifier input is the concatenation of all reductions, so its
        # width is only known when the tok2vec width is.
        if tok2vec_width is None:
            nI = None
        else:
            nI = tok2vec_width * len(reductions)

        make_output = Softmax if exclusive_classes else Linear
        output_layer = make_output(nO=nO, nI=nI)
        resizable_layer: Model = resizable(
            output_layer,
            resize_layer=partial(resize_linear_weighted, fill_defaults=fill_defaults),
        )
        model = pooled >> resizable_layer
        if not exclusive_classes:
            model = model >> Logistic()

        model.set_ref("output_layer", output_layer)
        model.attrs["resize_output"] = partial(
            resize_and_set_ref,
            resizable_layer=resizable_layer,
        )
    model.set_ref("tok2vec", tok2vec)
    if nO is not None:
        model.set_dim("nO", cast(int, nO))
    model.attrs["multi_label"] = not exclusive_classes
    return model
|  |  | |||
|  | @ -67,8 +67,8 @@ def build_hash_embed_cnn_tok2vec( | |||
|         are between 2 and 8. | ||||
|     window_size (int): The number of tokens on either side to concatenate during | ||||
|         the convolutions. The receptive field of the CNN will be | ||||
|         depth * (window_size * 2 + 1), so a 4-layer network with window_size of | ||||
|         2 will be sensitive to 20 words at a time. Recommended value is 1. | ||||
|         depth * window_size * 2 + 1, so a 4-layer network with window_size of | ||||
|         2 will be sensitive to 17 words at a time. Recommended value is 1. | ||||
|     embed_size (int): The number of rows in the hash embedding tables. This can | ||||
|         be surprisingly small, due to the use of the hash embeddings. Recommended | ||||
|         values are between 2000 and 10000. | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| # cython: infer_types=True, cdivision=True, boundscheck=False | ||||
| # cython: profile=False | ||||
| cimport numpy as np | ||||
| from libc.math cimport exp | ||||
| from libc.stdlib cimport calloc, free, realloc | ||||
|  |  | |||
|  | @ -9,7 +9,7 @@ from thinc.util import partial | |||
| from ..attrs import ORTH | ||||
| from ..errors import Errors, Warnings | ||||
| from ..tokens import Doc | ||||
| from ..vectors import Mode | ||||
| from ..vectors import Mode, Vectors | ||||
| from ..vocab import Vocab | ||||
| 
 | ||||
| 
 | ||||
|  | @ -48,11 +48,14 @@ def forward( | |||
|     key_attr: int = getattr(vocab.vectors, "attr", ORTH) | ||||
|     keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs]) | ||||
|     W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) | ||||
|     if vocab.vectors.mode == Mode.default: | ||||
|     if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default: | ||||
|         V = model.ops.asarray(vocab.vectors.data) | ||||
|         rows = vocab.vectors.find(keys=keys) | ||||
|         V = model.ops.as_contig(V[rows]) | ||||
|     elif vocab.vectors.mode == Mode.floret: | ||||
|     elif isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.floret: | ||||
|         V = vocab.vectors.get_batch(keys) | ||||
|         V = model.ops.as_contig(V) | ||||
|     elif hasattr(vocab.vectors, "get_batch"): | ||||
|         V = vocab.vectors.get_batch(keys) | ||||
|         V = model.ops.as_contig(V) | ||||
|     else: | ||||
|  | @ -61,7 +64,7 @@ def forward( | |||
|         vectors_data = model.ops.gemm(V, W, trans2=True) | ||||
|     except ValueError: | ||||
|         raise RuntimeError(Errors.E896) | ||||
|     if vocab.vectors.mode == Mode.default: | ||||
|     if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default: | ||||
|         # Convert negative indices to 0-vectors | ||||
|         # TODO: more options for UNK tokens | ||||
|         vectors_data[rows < 0] = 0 | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| # cython: infer_types | ||||
| # cython: profile=False | ||||
| import warnings | ||||
| from typing import Dict, List, Optional, Tuple, Union | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| 
 | ||||
| # cython: profile=False | ||||
| IDS = { | ||||
|     "": NO_TAG, | ||||
|     "ADJ": ADJ, | ||||
|  |  | |||
|  | @ -21,6 +21,7 @@ from .trainable_pipe import TrainablePipe | |||
| __all__ = [ | ||||
|     "AttributeRuler", | ||||
|     "DependencyParser", | ||||
|     "EditTreeLemmatizer", | ||||
|     "EntityLinker", | ||||
|     "EntityRecognizer", | ||||
|     "Morphologizer", | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| # cython: infer_types=True, binding=True | ||||
| # cython: profile=False | ||||
| from cython.operator cimport dereference as deref | ||||
| from libc.stdint cimport UINT32_MAX, uint32_t | ||||
| from libc.string cimport memset | ||||
|  |  | |||
|  | @ -1,8 +1,12 @@ | |||
| from collections import defaultdict | ||||
| from typing import Any, Dict, List, Union | ||||
| 
 | ||||
| from pydantic import BaseModel, Field, ValidationError | ||||
| from pydantic.types import StrictBool, StrictInt, StrictStr | ||||
| try: | ||||
|     from pydantic.v1 import BaseModel, Field, ValidationError | ||||
|     from pydantic.v1.types import StrictBool, StrictInt, StrictStr | ||||
| except ImportError: | ||||
|     from pydantic import BaseModel, Field, ValidationError  # type: ignore | ||||
|     from pydantic.types import StrictBool, StrictInt, StrictStr  # type: ignore | ||||
| 
 | ||||
| 
 | ||||
| class MatchNodeSchema(BaseModel): | ||||
|  |  | |||
|  | @ -1,5 +1,4 @@ | |||
| # cython: infer_types=True | ||||
| # cython: profile=True | ||||
| import numpy | ||||
| 
 | ||||
| from ...typedefs cimport class_t | ||||
|  |  | |||
|  | @ -0,0 +1 @@ | |||
| # cython: profile=False | ||||
|  | @ -1,4 +1,4 @@ | |||
| # cython: profile=True, cdivision=True, infer_types=True | ||||
| # cython: cdivision=True, infer_types=True | ||||
| from cymem.cymem cimport Address, Pool | ||||
| from libc.stdint cimport int32_t | ||||
| from libcpp.vector cimport vector | ||||
|  |  | |||
|  | @ -1,3 +1,4 @@ | |||
| # cython: profile=False | ||||
| from cymem.cymem cimport Pool | ||||
| from libcpp.memory cimport shared_ptr | ||||
| from libcpp.vector cimport vector | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: profile=True, infer_types=True | ||||
| # cython: infer_types=True | ||||
| """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 | ||||
| for doing pseudo-projective parsing implementation uses the HEAD decoration | ||||
| scheme. | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| # cython: infer_types=True | ||||
| # cython: profile=False | ||||
| from libcpp.vector cimport vector | ||||
| 
 | ||||
| from ...tokens.doc cimport Doc | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| # cython: infer_types=True | ||||
| # cython: profile=False | ||||
| from __future__ import print_function | ||||
| 
 | ||||
| from cymem.cymem cimport Pool | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| # cython: infer_types=True, binding=True | ||||
| from collections import defaultdict | ||||
| from typing import Callable, Optional | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| # cython: infer_types=True, binding=True | ||||
| from itertools import islice | ||||
| from typing import Callable, Dict, Iterable, Optional, Union | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| # cython: infer_types=True, binding=True | ||||
| from collections import defaultdict | ||||
| from typing import Callable, Optional | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| # cython: infer_types=True, binding=True | ||||
| from typing import Callable, Dict, Iterable, Iterator, Tuple, Union | ||||
| 
 | ||||
| import srsly | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| # cython: infer_types=True, binding=True | ||||
| from typing import Callable, List, Optional | ||||
| 
 | ||||
| import srsly | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| # cython: infer_types=True, binding=True | ||||
| from itertools import islice | ||||
| from typing import Callable, Iterable, Optional | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| # cython: infer_types=True, binding=True | ||||
| from itertools import islice | ||||
| from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union | ||||
| 
 | ||||
|  |  | |||
|  | @ -39,8 +39,9 @@ maxout_pieces = 3 | |||
| depth = 2 | ||||
| 
 | ||||
| [model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| """ | ||||
|  | @ -48,16 +49,21 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m | |||
| 
 | ||||
| single_label_bow_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| """ | ||||
| 
 | ||||
| single_label_cnn_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatCNN.v2" | ||||
| @architectures = "spacy.TextCatReduce.v1" | ||||
| exclusive_classes = true | ||||
| use_reduce_first = false | ||||
| use_reduce_last = false | ||||
| use_reduce_max = false | ||||
| use_reduce_mean = true | ||||
| 
 | ||||
| [model.tok2vec] | ||||
| @architectures = "spacy.HashEmbedCNN.v2" | ||||
|  |  | |||
|  | @ -35,8 +35,9 @@ maxout_pieces = 3 | |||
| depth = 2 | ||||
| 
 | ||||
| [model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| """ | ||||
|  | @ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod | |||
| 
 | ||||
| multi_label_bow_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
|  | @ -52,8 +53,12 @@ no_output_layer = false | |||
| 
 | ||||
| multi_label_cnn_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatCNN.v2" | ||||
| @architectures = "spacy.TextCatReduce.v1" | ||||
| exclusive_classes = false | ||||
| use_reduce_first = false | ||||
| use_reduce_last = false | ||||
| use_reduce_max = false | ||||
| use_reduce_mean = true | ||||
| 
 | ||||
| [model.tok2vec] | ||||
| @architectures = "spacy.HashEmbedCNN.v2" | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| # cython: infer_types=True, binding=True | ||||
| from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple | ||||
| 
 | ||||
| import srsly | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True | ||||
| # cython: profile=False | ||||
| from __future__ import print_function | ||||
| 
 | ||||
| from typing import Dict, Iterable, List, Optional, Tuple | ||||
|  |  | |||
							
								
								
									
										102
									
								
								spacy/schemas.py
									
									
									
									
									
								
							
							
						
						
									
										102
									
								
								spacy/schemas.py
									
									
									
									
									
								
							|  | @ -17,19 +17,34 @@ from typing import ( | |||
|     Union, | ||||
| ) | ||||
| 
 | ||||
| from pydantic import ( | ||||
|     BaseModel, | ||||
|     ConstrainedStr, | ||||
|     Field, | ||||
|     StrictBool, | ||||
|     StrictFloat, | ||||
|     StrictInt, | ||||
|     StrictStr, | ||||
|     ValidationError, | ||||
|     create_model, | ||||
|     validator, | ||||
| ) | ||||
| from pydantic.main import ModelMetaclass | ||||
| try: | ||||
|     from pydantic.v1 import ( | ||||
|         BaseModel, | ||||
|         ConstrainedStr, | ||||
|         Field, | ||||
|         StrictBool, | ||||
|         StrictFloat, | ||||
|         StrictInt, | ||||
|         StrictStr, | ||||
|         ValidationError, | ||||
|         create_model, | ||||
|         validator, | ||||
|     ) | ||||
|     from pydantic.v1.main import ModelMetaclass | ||||
| except ImportError: | ||||
|     from pydantic import (  # type: ignore | ||||
|         BaseModel, | ||||
|         ConstrainedStr, | ||||
|         Field, | ||||
|         StrictBool, | ||||
|         StrictFloat, | ||||
|         StrictInt, | ||||
|         StrictStr, | ||||
|         ValidationError, | ||||
|         create_model, | ||||
|         validator, | ||||
|     ) | ||||
|     from pydantic.main import ModelMetaclass  # type: ignore | ||||
| from thinc.api import ConfigValidationError, Model, Optimizer | ||||
| from thinc.config import Promise | ||||
| 
 | ||||
|  | @ -397,6 +412,7 @@ class ConfigSchemaNlp(BaseModel): | |||
|     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed") | ||||
|     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed") | ||||
|     batch_size: Optional[int] = Field(..., title="Default batch size") | ||||
|     vectors: Callable = Field(..., title="Vectors implementation") | ||||
|     # fmt: on | ||||
| 
 | ||||
|     class Config: | ||||
|  | @ -488,66 +504,6 @@ CONFIG_SCHEMAS = { | |||
|     "distillation": ConfigSchemaDistill, | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| # Project config Schema | ||||
| 
 | ||||
| 
 | ||||
class ProjectConfigAssetGitItem(BaseModel):
    # Location of an asset inside a Git repository: repository URL, path
    # within the repository and the branch to fetch (defaults to "master").
    repo: StrictStr = Field(
        ...,
        title="URL of Git repo to download from",
    )
    path: StrictStr = Field(
        ...,
        title="File path or sub-directory to download (used for sparse checkout)",
    )
    branch: StrictStr = Field(
        "master",
        title="Branch to clone from",
    )
| 
 | ||||
| 
 | ||||
class ProjectConfigAssetURL(BaseModel):
    # Asset described by a destination path plus an optional URL, checksum
    # and free-text description.
    dest: StrictStr = Field(
        ...,
        title="Destination of downloaded asset",
    )
    url: Optional[StrictStr] = Field(
        None,
        title="URL of asset",
    )
    # NOTE(review): the pattern has no end anchor -- confirm whether values
    # longer than 32 hex characters are meant to be accepted.
    checksum: Optional[str] = Field(
        None,
        title="MD5 hash of file",
        regex=r"([a-fA-F\d]{32})",
    )
    description: StrictStr = Field(
        "",
        title="Description of asset",
    )
| 
 | ||||
| 
 | ||||
class ProjectConfigAssetGit(BaseModel):
    # Asset described by a nested Git location plus an optional checksum and
    # description.
    git: ProjectConfigAssetGitItem = Field(
        ...,
        title="Git repo information",
    )
    # NOTE(review): the pattern has no end anchor -- confirm whether values
    # longer than 32 hex characters are meant to be accepted.
    checksum: Optional[str] = Field(
        None,
        title="MD5 hash of file",
        regex=r"([a-fA-F\d]{32})",
    )
    description: Optional[StrictStr] = Field(
        None,
        title="Description of asset",
    )
| 
 | ||||
| 
 | ||||
class ProjectConfigCommand(BaseModel):
    # One named project command: only `name` is required, every list field
    # defaults to an empty list and `no_skip` defaults to False.
    name: StrictStr = Field(
        ...,
        title="Name of command",
    )
    help: Optional[StrictStr] = Field(
        None,
        title="Command description",
    )
    script: List[StrictStr] = Field(
        [],
        title="List of CLI commands to run, in order",
    )
    deps: List[StrictStr] = Field(
        [],
        title="File dependencies required by this command",
    )
    outputs: List[StrictStr] = Field(
        [],
        title="Outputs produced by this command",
    )
    outputs_no_cache: List[StrictStr] = Field(
        [],
        title="Outputs not tracked by DVC (DVC only)",
    )
    no_skip: bool = Field(
        False,
        title="Never skip this command, even if nothing changed",
    )

    class Config:
        # Unknown keys in a command entry are rejected rather than ignored.
        title = "A single named command specified in a project config"
        extra = "forbid"
| 
 | ||||
| 
 | ||||
class ProjectConfigSchema(BaseModel):
    # Top-level layout of a project configuration file. Every field is
    # optional: mappings and lists default to empty, `title` and
    # `spacy_version` default to None.
    # fmt: off
    vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
    env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
    assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
    workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
    # Title previously read "shortucts"; corrected spelling of the
    # user-facing text.
    commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
    title: Optional[str] = Field(None, title="Project title")
    spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
    # fmt: on

    class Config:
        title = "Schema for project configuration file"
| 
 | ||||
| 
 | ||||
| # Recommendations for init config workflows | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										138
									
								
								spacy/scorer.py
									
									
									
									
									
								
							
							
						
						
									
										138
									
								
								spacy/scorer.py
									
									
									
									
									
								
							|  | @ -802,6 +802,140 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: | |||
|         } | ||||
| 
 | ||||
| 
 | ||||
| # The following implementation of trapezoid() is adapted from SciPy, | ||||
| # which is distributed under the New BSD License. | ||||
| # Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers. | ||||
| # See licenses/3rd_party_licenses.txt | ||||
| def trapezoid(y, x=None, dx=1.0, axis=-1): | ||||
|     r""" | ||||
|     Integrate along the given axis using the composite trapezoidal rule. | ||||
| 
 | ||||
|     If `x` is provided, the integration happens in sequence along its | ||||
|     elements - they are not sorted. | ||||
| 
 | ||||
|     Integrate `y` (`x`) along each 1d slice on the given axis, compute | ||||
|     :math:`\int y(x) dx`. | ||||
|     When `x` is specified, this integrates along the parametric curve, | ||||
|     computing :math:`\int_t y(t) dt = | ||||
|     \int_t y(t) \left.\frac{dx}{dt}\right|_{x=x(t)} dt`. | ||||
| 
 | ||||
|     Parameters | ||||
|     ---------- | ||||
|     y : array_like | ||||
|         Input array to integrate. | ||||
|     x : array_like, optional | ||||
|         The sample points corresponding to the `y` values. If `x` is None, | ||||
|         the sample points are assumed to be evenly spaced `dx` apart. The | ||||
|         default is None. | ||||
|     dx : scalar, optional | ||||
|         The spacing between sample points when `x` is None. The default is 1. | ||||
|     axis : int, optional | ||||
|         The axis along which to integrate. | ||||
| 
 | ||||
|     Returns | ||||
|     ------- | ||||
|     trapezoid : float or ndarray | ||||
|         Definite integral of `y` = n-dimensional array as approximated along | ||||
|         a single axis by the trapezoidal rule. If `y` is a 1-dimensional array, | ||||
|         then the result is a float. If `n` is greater than 1, then the result | ||||
|         is an `n`-1 dimensional array. | ||||
| 
 | ||||
|     See Also | ||||
|     -------- | ||||
|     cumulative_trapezoid, simpson, romb | ||||
| 
 | ||||
|     Notes | ||||
|     ----- | ||||
|     Image [2]_ illustrates trapezoidal rule -- y-axis locations of points | ||||
|     will be taken from `y` array, by default x-axis distances between | ||||
|     points will be 1.0, alternatively they can be provided with `x` array | ||||
|     or with `dx` scalar.  Return value will be equal to combined area under | ||||
|     the red lines. | ||||
| 
 | ||||
|     References | ||||
|     ---------- | ||||
|     .. [1] Wikipedia page: https://en.wikipedia.org/wiki/Trapezoidal_rule | ||||
| 
 | ||||
|     .. [2] Illustration image: | ||||
|            https://en.wikipedia.org/wiki/File:Composite_trapezoidal_rule_illustration.png | ||||
| 
 | ||||
|     Examples | ||||
|     -------- | ||||
|     Use the trapezoidal rule on evenly spaced points: | ||||
| 
 | ||||
|     >>> import numpy as np | ||||
|     >>> from scipy import integrate | ||||
|     >>> integrate.trapezoid([1, 2, 3]) | ||||
|     4.0 | ||||
| 
 | ||||
|     The spacing between sample points can be selected by either the | ||||
|     ``x`` or ``dx`` arguments: | ||||
| 
 | ||||
|     >>> integrate.trapezoid([1, 2, 3], x=[4, 6, 8]) | ||||
|     8.0 | ||||
|     >>> integrate.trapezoid([1, 2, 3], dx=2) | ||||
|     8.0 | ||||
| 
 | ||||
|     Using a decreasing ``x`` corresponds to integrating in reverse: | ||||
| 
 | ||||
|     >>> integrate.trapezoid([1, 2, 3], x=[8, 6, 4]) | ||||
|     -8.0 | ||||
| 
 | ||||
|     More generally ``x`` is used to integrate along a parametric curve. We can | ||||
|     estimate the integral :math:`\int_0^1 x^2 = 1/3` using: | ||||
| 
 | ||||
|     >>> x = np.linspace(0, 1, num=50) | ||||
|     >>> y = x**2 | ||||
|     >>> integrate.trapezoid(y, x) | ||||
|     0.33340274885464394 | ||||
| 
 | ||||
|     Or estimate the area of a circle, noting we repeat the sample which closes | ||||
|     the curve: | ||||
| 
 | ||||
|     >>> theta = np.linspace(0, 2 * np.pi, num=1000, endpoint=True) | ||||
|     >>> integrate.trapezoid(np.cos(theta), x=np.sin(theta)) | ||||
|     3.141571941375841 | ||||
| 
 | ||||
|     ``trapezoid`` can be applied along a specified axis to do multiple | ||||
|     computations in one call: | ||||
| 
 | ||||
|     >>> a = np.arange(6).reshape(2, 3) | ||||
|     >>> a | ||||
|     array([[0, 1, 2], | ||||
|            [3, 4, 5]]) | ||||
|     >>> integrate.trapezoid(a, axis=0) | ||||
|     array([1.5, 2.5, 3.5]) | ||||
|     >>> integrate.trapezoid(a, axis=1) | ||||
|     array([2.,  8.]) | ||||
|     """ | ||||
|     y = np.asanyarray(y) | ||||
|     if x is None: | ||||
|         d = dx | ||||
|     else: | ||||
|         x = np.asanyarray(x) | ||||
|         if x.ndim == 1: | ||||
|             d = np.diff(x) | ||||
|             # reshape to correct shape | ||||
|             shape = [1] * y.ndim | ||||
|             shape[axis] = d.shape[0] | ||||
|             d = d.reshape(shape) | ||||
|         else: | ||||
|             d = np.diff(x, axis=axis) | ||||
|     nd = y.ndim | ||||
|     slice1 = [slice(None)] * nd | ||||
|     slice2 = [slice(None)] * nd | ||||
|     slice1[axis] = slice(1, None) | ||||
|     slice2[axis] = slice(None, -1) | ||||
|     try: | ||||
|         ret = (d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0).sum(axis) | ||||
|     except ValueError: | ||||
|         # Operations didn't work, cast to ndarray | ||||
|         d = np.asarray(d) | ||||
|         y = np.asarray(y) | ||||
|         ret = np.add.reduce(d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0, axis) | ||||
|     return ret | ||||
| 
 | ||||
| 
 | ||||
| # The following implementation of roc_auc_score() is adapted from | ||||
| # scikit-learn, which is distributed under the New BSD License. | ||||
| # Copyright (c) 2007–2019 The scikit-learn developers. | ||||
|  | @ -1024,9 +1158,9 @@ def _auc(x, y): | |||
|         else: | ||||
|             raise ValueError(Errors.E164.format(x=x)) | ||||
| 
 | ||||
|     area = direction * np.trapz(y, x) | ||||
|     area = direction * trapezoid(y, x) | ||||
|     if isinstance(area, np.memmap): | ||||
|         # Reductions such as .sum used internally in np.trapz do not return a | ||||
|         # Reductions such as .sum used internally in trapezoid do not return a | ||||
|         # scalar by default for numpy.memmap instances contrary to | ||||
|         # regular numpy.ndarray instances. | ||||
|         area = area.dtype.type(area) | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| # cython: infer_types=True | ||||
| # cython: profile=False | ||||
| from typing import Iterable, Iterator, List, Optional, Tuple, Union | ||||
| 
 | ||||
| from libc.stdint cimport uint32_t | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| # cython: optimize.unpack_method_calls=False | ||||
| # cython: profile=False | ||||
| IDS = { | ||||
|     "": NIL, | ||||
|     "IS_ALPHA": IS_ALPHA, | ||||
|  |  | |||
|  | @ -194,6 +194,11 @@ def fi_tokenizer(): | |||
|     return get_lang_class("fi")().tokenizer | ||||
| 
 | ||||
| 
 | ||||
@pytest.fixture(scope="session")
def fo_tokenizer():
    """Session-scoped fixture: tokenizer of a fresh "fo" language object."""
    language_cls = get_lang_class("fo")
    return language_cls().tokenizer
| 
 | ||||
| 
 | ||||
@pytest.fixture(scope="session")
def fr_tokenizer():
    """Session-scoped fixture: tokenizer of a fresh "fr" language object."""
    language_cls = get_lang_class("fr")
    return language_cls().tokenizer
|  | @ -363,6 +368,11 @@ def nl_tokenizer(): | |||
|     return get_lang_class("nl")().tokenizer | ||||
| 
 | ||||
| 
 | ||||
@pytest.fixture(scope="session")
def nn_tokenizer():
    """Session-scoped fixture: tokenizer of a fresh "nn" language object."""
    language_cls = get_lang_class("nn")
    return language_cls().tokenizer
| 
 | ||||
| 
 | ||||
@pytest.fixture(scope="session")
def pl_tokenizer():
    """Session-scoped fixture: tokenizer of a fresh "pl" language object."""
    language_cls = get_lang_class("pl")
    return language_cls().tokenizer
|  |  | |||
|  | @ -783,3 +783,12 @@ def test_for_no_ent_sents(): | |||
|     sents = list(doc.ents[0].sents) | ||||
|     assert len(sents) == 1 | ||||
|     assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY" | ||||
| 
 | ||||
| 
 | ||||
def test_span_api_richcmp_other(en_tokenizer):
    """A slice of a doc must not compare equal to a single indexed item, to
    anything taken from a different doc, or to a whole doc."""
    doc1 = en_tokenizer("a b")
    doc2 = en_tokenizer("b c")
    span = doc1[1:2]
    # Deliberately `not a == b` rather than `a != b`: the equality operator
    # is the one under test.
    for other in (doc1[1], doc2[0], doc2[0:1]):
        assert not span == other
    assert not doc1[0:1] == doc2
|  |  | |||
|  | @ -294,3 +294,12 @@ def test_missing_head_dep(en_vocab): | |||
|     assert aligned_heads[0] == ref_heads[0] | ||||
|     assert aligned_deps[5] == ref_deps[5] | ||||
|     assert aligned_heads[5] == ref_heads[5] | ||||
| 
 | ||||
| 
 | ||||
def test_token_api_richcmp_other(en_tokenizer):
    """A single indexed item of a doc must not compare equal to a slice, to
    anything taken from a different doc, or to a whole doc."""
    doc1 = en_tokenizer("a b")
    doc2 = en_tokenizer("b c")
    token = doc1[1]
    # Deliberately `not a == b` rather than `a != b`: the equality operator
    # is the one under test.
    for other in (doc1[0:1], doc2[1:2], doc2[0]):
        assert not token == other
    assert not doc1[0] == doc2
|  |  | |||
							
								
								
									
										0
									
								
								spacy/tests/lang/fo/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/tests/lang/fo/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										26
									
								
								spacy/tests/lang/fo/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								spacy/tests/lang/fo/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,26 @@ | |||
| import pytest | ||||
| 
 | ||||
| # examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed with CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/) | ||||
| # fmt: off | ||||
| FO_TOKEN_EXCEPTION_TESTS = [ | ||||
|     ( | ||||
|         "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ", | ||||
|         [ | ||||
|             "Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", ".", | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         "Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.", | ||||
|         [ | ||||
|             "Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", ".", | ||||
|         ], | ||||
|     ), | ||||
| ] | ||||
| # fmt: on | ||||
| 
 | ||||
| 
 | ||||
@pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS)
def test_fo_tokenizer_handles_exception_cases(fo_tokenizer, text, expected_tokens):
    """Tokenize each sample text and compare the non-space token texts with
    the expected list."""
    observed = []
    for token in fo_tokenizer(text):
        if token.is_space:
            continue
        observed.append(token.text)
    assert expected_tokens == observed
							
								
								
									
										0
									
								
								spacy/tests/lang/nn/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/tests/lang/nn/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										38
									
								
								spacy/tests/lang/nn/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								spacy/tests/lang/nn/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,38 @@ | |||
| import pytest | ||||
| 
 | ||||
| # examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) | ||||
| # fmt: off | ||||
| NN_TOKEN_EXCEPTION_TESTS = [ | ||||
|     ( | ||||
|         "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.", | ||||
|         [ | ||||
|             "Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", ".", | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         "Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.", | ||||
|         [ | ||||
|             "Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", ".", | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         "Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.", | ||||
|         [ | ||||
|             "Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", ".", | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         "Brukssesongen er frå nov. til mai, med ein topp i mars.", | ||||
|         [ | ||||
|             "Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", ".", | ||||
|         ], | ||||
|     ), | ||||
| ] | ||||
| # fmt: on | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS) | ||||
| def test_nn_tokenizer_handles_exception_cases(nn_tokenizer, text, expected_tokens): | ||||
|     tokens = nn_tokenizer(text) | ||||
|     token_list = [token.text for token in tokens if not token.is_space] | ||||
|     assert expected_tokens == token_list | ||||
|  | @ -216,6 +216,11 @@ def test_dependency_matcher_pattern_validation(en_vocab): | |||
|         pattern2 = copy.deepcopy(pattern) | ||||
|         pattern2[1]["RIGHT_ID"] = "fox" | ||||
|         matcher.add("FOUNDED", [pattern2]) | ||||
|     # invalid key | ||||
|     with pytest.warns(UserWarning): | ||||
|         pattern2 = copy.deepcopy(pattern) | ||||
|         pattern2[1]["FOO"] = "BAR" | ||||
|         matcher.add("FOUNDED", [pattern2]) | ||||
| 
 | ||||
| 
 | ||||
| def test_dependency_matcher_callback(en_vocab, doc): | ||||
|  |  | |||
|  | @ -5,6 +5,7 @@ from pathlib import Path | |||
| def test_build_dependencies(): | ||||
|     # Check that library requirements are pinned exactly the same across different setup files. | ||||
|     libs_ignore_requirements = [ | ||||
|         "numpy", | ||||
|         "pytest", | ||||
|         "pytest-timeout", | ||||
|         "mock", | ||||
|  | @ -22,6 +23,7 @@ def test_build_dependencies(): | |||
|     ] | ||||
|     # ignore language-specific packages that shouldn't be installed by all | ||||
|     libs_ignore_setup = [ | ||||
|         "numpy", | ||||
|         "fugashi", | ||||
|         "mecab-ko", | ||||
|         "pythainlp", | ||||
|  |  | |||
|  | @ -1,5 +1,10 @@ | |||
| import pytest | ||||
| from pydantic import StrictBool | ||||
| 
 | ||||
| try: | ||||
|     from pydantic.v1 import StrictBool | ||||
| except ImportError: | ||||
|     from pydantic import StrictBool  # type: ignore | ||||
| 
 | ||||
| from thinc.api import ConfigValidationError | ||||
| 
 | ||||
| from spacy.lang.en import English | ||||
|  |  | |||
|  | @ -1,5 +1,10 @@ | |||
| import pytest | ||||
| from pydantic import StrictInt, StrictStr | ||||
| 
 | ||||
| try: | ||||
|     from pydantic.v1 import StrictInt, StrictStr | ||||
| except ImportError: | ||||
|     from pydantic import StrictInt, StrictStr  # type: ignore | ||||
| 
 | ||||
| from thinc.api import ConfigValidationError, Linear, Model | ||||
| 
 | ||||
| import spacy | ||||
|  | @ -198,7 +203,7 @@ def test_pipe_class_component_model(): | |||
|             "@architectures": "spacy.TextCatEnsemble.v2", | ||||
|             "tok2vec": DEFAULT_TOK2VEC_MODEL, | ||||
|             "linear_model": { | ||||
|                 "@architectures": "spacy.TextCatBOW.v2", | ||||
|                 "@architectures": "spacy.TextCatBOW.v3", | ||||
|                 "exclusive_classes": False, | ||||
|                 "ngram_size": 1, | ||||
|                 "no_output_layer": False, | ||||
|  |  | |||
Some files were not shown because too many files have changed in this diff Show More
		Loading…
	
		Reference in New Issue
	
	Block a user