Merge pull request #13287 from explosion/docs/llm_main
Sync `docs/llm_develop` with `docs/llm_main`
							
								
								
									
										1
									
								
								.github/FUNDING.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions]
 | 
				
			||||||
							
								
								
									
										4
									
								
								.github/workflows/tests.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						| 
						 | 
					@ -58,7 +58,7 @@ jobs:
 | 
				
			||||||
      fail-fast: true
 | 
					      fail-fast: true
 | 
				
			||||||
      matrix:
 | 
					      matrix:
 | 
				
			||||||
        os: [ubuntu-latest, windows-latest, macos-latest]
 | 
					        os: [ubuntu-latest, windows-latest, macos-latest]
 | 
				
			||||||
        python_version: ["3.11", "3.12.0-rc.2"]
 | 
					        python_version: ["3.12"]
 | 
				
			||||||
        include:
 | 
					        include:
 | 
				
			||||||
          - os: windows-latest
 | 
					          - os: windows-latest
 | 
				
			||||||
            python_version: "3.7"
 | 
					            python_version: "3.7"
 | 
				
			||||||
| 
						 | 
					@ -68,6 +68,8 @@ jobs:
 | 
				
			||||||
            python_version: "3.9"
 | 
					            python_version: "3.9"
 | 
				
			||||||
          - os: windows-latest
 | 
					          - os: windows-latest
 | 
				
			||||||
            python_version: "3.10"
 | 
					            python_version: "3.10"
 | 
				
			||||||
 | 
					          - os: macos-latest
 | 
				
			||||||
 | 
					            python_version: "3.11"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    runs-on: ${{ matrix.os }}
 | 
					    runs-on: ${{ matrix.os }}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										2
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						| 
						 | 
					@ -1,6 +1,6 @@
 | 
				
			||||||
The MIT License (MIT)
 | 
					The MIT License (MIT)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
 | 
					Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
					Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
				
			||||||
of this software and associated documentation files (the "Software"), to deal
 | 
					of this software and associated documentation files (the "Software"), to deal
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										11
									
								
								README.md
									
									
									
									
									
								
							
							
						
						| 
						 | 
					@ -39,28 +39,35 @@ open-source software, released under the
 | 
				
			||||||
| 🚀 **[New in v3.0]**                                                                                                                                                                                                      | New features, backwards incompatibilities and migration guide.                                                                                                                                                                                                                                                                               |
 | 
					| 🚀 **[New in v3.0]**                                                                                                                                                                                                      | New features, backwards incompatibilities and migration guide.                                                                                                                                                                                                                                                                               |
 | 
				
			||||||
| 🪐 **[Project Templates]**                                                                                                                                                                                                | End-to-end workflows you can clone, modify and run.                                                                                                                                                                                                                                                                                          |
 | 
					| 🪐 **[Project Templates]**                                                                                                                                                                                                | End-to-end workflows you can clone, modify and run.                                                                                                                                                                                                                                                                                          |
 | 
				
			||||||
| 🎛 **[API Reference]**                                                                                                                                                                                                     | The detailed reference for spaCy's API.                                                                                                                                                                                                                                                                                                      |
 | 
					| 🎛 **[API Reference]**                                                                                                                                                                                                     | The detailed reference for spaCy's API.                                                                                                                                                                                                                                                                                                      |
 | 
				
			||||||
 | 
					| ⏩ **[GPU Processing]**                                                                                                                                                                                                    | Use spaCy with CUDA-compatible GPU processing.                                                                                                                                                                                                                                                                                               |
 | 
				
			||||||
| 📦 **[Models]**                                                                                                                                                                                                           | Download trained pipelines for spaCy.                                                                                                                                                                                                                                                                                                        |
 | 
					| 📦 **[Models]**                                                                                                                                                                                                           | Download trained pipelines for spaCy.                                                                                                                                                                                                                                                                                                        |
 | 
				
			||||||
 | 
					| 🦙 **[Large Language Models]**                                                                                                                                                                                            | Integrate LLMs into spaCy pipelines.                                                                                                                                                                                                                                                                                                        |
 | 
				
			||||||
| 🌌 **[Universe]**                                                                                                                                                                                                         | Plugins, extensions, demos and books from the spaCy ecosystem.                                                                                                                                                                                                                                                                               |
 | 
					| 🌌 **[Universe]**                                                                                                                                                                                                         | Plugins, extensions, demos and books from the spaCy ecosystem.                                                                                                                                                                                                                                                                               |
 | 
				
			||||||
| ⚙️ **[spaCy VS Code Extension]**                                                                                                                                                                                          | Additional tooling and features for working with spaCy's config files.                                                                                                                                                                                                                                                                       |
 | 
					| ⚙️ **[spaCy VS Code Extension]**                                                                                                                                                                                          | Additional tooling and features for working with spaCy's config files.                                                                                                                                                                                                                                                                       |
 | 
				
			||||||
| 👩🏫 **[Online Course]**                                                                                                                                                                                                    | Learn spaCy in this free and interactive online course.                                                                                                                                                                                                                                                                                      |
 | 
					| 👩🏫 **[Online Course]**                                                                                                                                                                                                    | Learn spaCy in this free and interactive online course.                                                                                                                                                                                                                                                                                      |
 | 
				
			||||||
 | 
					| 📰 **[Blog]**                                                                                                                                                                                                             | Read about current spaCy and Prodigy development, releases, talks and more from Explosion.                                                                                                                                                                                                                 |
 | 
				
			||||||
| 📺 **[Videos]**                                                                                                                                                                                                           | Our YouTube channel with video tutorials, talks and more.                                                                                                                                                                                                                                                                                    |
 | 
					| 📺 **[Videos]**                                                                                                                                                                                                           | Our YouTube channel with video tutorials, talks and more.                                                                                                                                                                                                                                                                                    |
 | 
				
			||||||
| 🛠 **[Changelog]**                                                                                                                                                                                                         | Changes and version history.                                                                                                                                                                                                                                                                                                                 |
 | 
					| 🛠 **[Changelog]**                                                                                                                                                                                                         | Changes and version history.                                                                                                                                                                                                                                                                                                                 |
 | 
				
			||||||
| 💝 **[Contribute]**                                                                                                                                                                                                       | How to contribute to the spaCy project and code base.                                                                                                                                                                                                                                                                                        |
 | 
					| 💝 **[Contribute]**                                                                                                                                                                                                       | How to contribute to the spaCy project and code base.                                                                                                                                                                                                                                                                                        |
 | 
				
			||||||
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)**                 |
 | 
					| 👕 **[Swag]**                                                                                                                                                                                                             | Support us and our work with unique, custom-designed swag!                                                                                                                                                                                                                                                                                   |
 | 
				
			||||||
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a>   | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
 | 
					| <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy’s core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and we'll be in touch! **[Learn more →](https://explosion.ai/tailored-solutions)**                 |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[spacy 101]: https://spacy.io/usage/spacy-101
 | 
					[spacy 101]: https://spacy.io/usage/spacy-101
 | 
				
			||||||
[new in v3.0]: https://spacy.io/usage/v3
 | 
					[new in v3.0]: https://spacy.io/usage/v3
 | 
				
			||||||
[usage guides]: https://spacy.io/usage/
 | 
					[usage guides]: https://spacy.io/usage/
 | 
				
			||||||
[api reference]: https://spacy.io/api/
 | 
					[api reference]: https://spacy.io/api/
 | 
				
			||||||
 | 
					[gpu processing]: https://spacy.io/usage#gpu
 | 
				
			||||||
[models]: https://spacy.io/models
 | 
					[models]: https://spacy.io/models
 | 
				
			||||||
 | 
					[large language models]: https://spacy.io/usage/large-language-models
 | 
				
			||||||
[universe]: https://spacy.io/universe
 | 
					[universe]: https://spacy.io/universe
 | 
				
			||||||
[spacy vs code extension]: https://github.com/explosion/spacy-vscode
 | 
					[spacy vs code extension]: https://github.com/explosion/spacy-vscode
 | 
				
			||||||
[videos]: https://www.youtube.com/c/ExplosionAI
 | 
					[videos]: https://www.youtube.com/c/ExplosionAI
 | 
				
			||||||
[online course]: https://course.spacy.io
 | 
					[online course]: https://course.spacy.io
 | 
				
			||||||
 | 
					[blog]: https://explosion.ai
 | 
				
			||||||
[project templates]: https://github.com/explosion/projects
 | 
					[project templates]: https://github.com/explosion/projects
 | 
				
			||||||
[changelog]: https://spacy.io/usage#changelog
 | 
					[changelog]: https://spacy.io/usage#changelog
 | 
				
			||||||
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
 | 
					[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
 | 
				
			||||||
 | 
					[swag]: https://explosion.ai/merch
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## 💬 Where to ask questions
 | 
					## 💬 Where to ask questions
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -158,3 +158,45 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
				
			||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
					LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
				
			||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
					OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
				
			||||||
SOFTWARE.
 | 
					SOFTWARE.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					SciPy
 | 
				
			||||||
 | 
					-----
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					* Files: scorer.py
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The implementation of trapezoid() is adapted from SciPy, which is distributed
 | 
				
			||||||
 | 
					under the following license:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					New BSD License
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
 | 
				
			||||||
 | 
					All rights reserved.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Redistribution and use in source and binary forms, with or without
 | 
				
			||||||
 | 
					modification, are permitted provided that the following conditions
 | 
				
			||||||
 | 
					are met:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					1. Redistributions of source code must retain the above copyright
 | 
				
			||||||
 | 
					   notice, this list of conditions and the following disclaimer.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					2. Redistributions in binary form must reproduce the above
 | 
				
			||||||
 | 
					   copyright notice, this list of conditions and the following
 | 
				
			||||||
 | 
					   disclaimer in the documentation and/or other materials provided
 | 
				
			||||||
 | 
					   with the distribution.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					3. Neither the name of the copyright holder nor the names of its
 | 
				
			||||||
 | 
					   contributors may be used to endorse or promote products derived
 | 
				
			||||||
 | 
					   from this software without specific prior written permission.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 | 
				
			||||||
 | 
					"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 | 
				
			||||||
 | 
					LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 | 
				
			||||||
 | 
					A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 | 
				
			||||||
 | 
					OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 | 
				
			||||||
 | 
					SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 | 
				
			||||||
 | 
					LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 | 
				
			||||||
 | 
					DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 | 
				
			||||||
 | 
					THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 | 
				
			||||||
 | 
					(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 | 
				
			||||||
 | 
					OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,7 +5,7 @@ requires = [
 | 
				
			||||||
    "cymem>=2.0.2,<2.1.0",
 | 
					    "cymem>=2.0.2,<2.1.0",
 | 
				
			||||||
    "preshed>=3.0.2,<3.1.0",
 | 
					    "preshed>=3.0.2,<3.1.0",
 | 
				
			||||||
    "murmurhash>=0.28.0,<1.1.0",
 | 
					    "murmurhash>=0.28.0,<1.1.0",
 | 
				
			||||||
    "thinc>=8.1.8,<8.3.0",
 | 
					    "thinc>=8.2.2,<8.3.0",
 | 
				
			||||||
    "numpy>=1.15.0; python_version < '3.9'",
 | 
					    "numpy>=1.15.0; python_version < '3.9'",
 | 
				
			||||||
    "numpy>=1.25.0; python_version >= '3.9'",
 | 
					    "numpy>=1.25.0; python_version >= '3.9'",
 | 
				
			||||||
]
 | 
					]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 | 
				
			||||||
spacy-loggers>=1.0.0,<2.0.0
 | 
					spacy-loggers>=1.0.0,<2.0.0
 | 
				
			||||||
cymem>=2.0.2,<2.1.0
 | 
					cymem>=2.0.2,<2.1.0
 | 
				
			||||||
preshed>=3.0.2,<3.1.0
 | 
					preshed>=3.0.2,<3.1.0
 | 
				
			||||||
thinc>=8.1.8,<8.3.0
 | 
					thinc>=8.2.2,<8.3.0
 | 
				
			||||||
ml_datasets>=0.2.0,<0.3.0
 | 
					ml_datasets>=0.2.0,<0.3.0
 | 
				
			||||||
murmurhash>=0.28.0,<1.1.0
 | 
					murmurhash>=0.28.0,<1.1.0
 | 
				
			||||||
wasabi>=0.9.1,<1.2.0
 | 
					wasabi>=0.9.1,<1.2.0
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -41,7 +41,7 @@ setup_requires =
 | 
				
			||||||
    cymem>=2.0.2,<2.1.0
 | 
					    cymem>=2.0.2,<2.1.0
 | 
				
			||||||
    preshed>=3.0.2,<3.1.0
 | 
					    preshed>=3.0.2,<3.1.0
 | 
				
			||||||
    murmurhash>=0.28.0,<1.1.0
 | 
					    murmurhash>=0.28.0,<1.1.0
 | 
				
			||||||
    thinc>=8.1.8,<8.3.0
 | 
					    thinc>=8.2.2,<8.3.0
 | 
				
			||||||
install_requires =
 | 
					install_requires =
 | 
				
			||||||
    # Our libraries
 | 
					    # Our libraries
 | 
				
			||||||
    spacy-legacy>=3.0.11,<3.1.0
 | 
					    spacy-legacy>=3.0.11,<3.1.0
 | 
				
			||||||
| 
						 | 
					@ -49,7 +49,7 @@ install_requires =
 | 
				
			||||||
    murmurhash>=0.28.0,<1.1.0
 | 
					    murmurhash>=0.28.0,<1.1.0
 | 
				
			||||||
    cymem>=2.0.2,<2.1.0
 | 
					    cymem>=2.0.2,<2.1.0
 | 
				
			||||||
    preshed>=3.0.2,<3.1.0
 | 
					    preshed>=3.0.2,<3.1.0
 | 
				
			||||||
    thinc>=8.1.8,<8.3.0
 | 
					    thinc>=8.2.2,<8.3.0
 | 
				
			||||||
    wasabi>=0.9.1,<1.2.0
 | 
					    wasabi>=0.9.1,<1.2.0
 | 
				
			||||||
    srsly>=2.4.3,<3.0.0
 | 
					    srsly>=2.4.3,<3.0.0
 | 
				
			||||||
    catalogue>=2.0.6,<2.1.0
 | 
					    catalogue>=2.0.6,<2.1.0
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,5 +1,5 @@
 | 
				
			||||||
# fmt: off
 | 
					# fmt: off
 | 
				
			||||||
__title__ = "spacy"
 | 
					__title__ = "spacy"
 | 
				
			||||||
__version__ = "3.7.1"
 | 
					__version__ = "3.7.2"
 | 
				
			||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 | 
					__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 | 
				
			||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 | 
					__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -22,8 +22,17 @@ from .init_pipeline import init_pipeline_cli  # noqa: F401
 | 
				
			||||||
from .package import package  # noqa: F401
 | 
					from .package import package  # noqa: F401
 | 
				
			||||||
from .pretrain import pretrain  # noqa: F401
 | 
					from .pretrain import pretrain  # noqa: F401
 | 
				
			||||||
from .profile import profile  # noqa: F401
 | 
					from .profile import profile  # noqa: F401
 | 
				
			||||||
from .train import train_cli  # noqa: F401
 | 
					from .project.assets import project_assets  # type: ignore[attr-defined]  # noqa: F401
 | 
				
			||||||
from .validate import validate  # noqa: F401
 | 
					from .project.clone import project_clone  # type: ignore[attr-defined]  # noqa: F401
 | 
				
			||||||
 | 
					from .project.document import (  # type: ignore[attr-defined]  # noqa: F401
 | 
				
			||||||
 | 
					    project_document,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					from .project.dvc import project_update_dvc  # type: ignore[attr-defined]  # noqa: F401
 | 
				
			||||||
 | 
					from .project.pull import project_pull  # type: ignore[attr-defined]  # noqa: F401
 | 
				
			||||||
 | 
					from .project.push import project_push  # type: ignore[attr-defined]  # noqa: F401
 | 
				
			||||||
 | 
					from .project.run import project_run  # type: ignore[attr-defined]  # noqa: F401
 | 
				
			||||||
 | 
					from .train import train_cli  # type: ignore[attr-defined]  # noqa: F401
 | 
				
			||||||
 | 
					from .validate import validate  # type: ignore[attr-defined]  # noqa: F401
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
 | 
					@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -13,7 +13,7 @@ from .. import util
 | 
				
			||||||
from ..language import Language
 | 
					from ..language import Language
 | 
				
			||||||
from ..tokens import Doc
 | 
					from ..tokens import Doc
 | 
				
			||||||
from ..training import Corpus
 | 
					from ..training import Corpus
 | 
				
			||||||
from ._util import Arg, Opt, benchmark_cli, setup_gpu
 | 
					from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@benchmark_cli.command(
 | 
					@benchmark_cli.command(
 | 
				
			||||||
| 
						 | 
					@ -30,12 +30,14 @@ def benchmark_speed_cli(
 | 
				
			||||||
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
 | 
					    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
 | 
				
			||||||
    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
 | 
					    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
 | 
				
			||||||
    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
 | 
					    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
 | 
				
			||||||
 | 
					    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
 | 
					    Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
 | 
				
			||||||
    data in the binary .spacy format.
 | 
					    data in the binary .spacy format.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    import_code(code_path)
 | 
				
			||||||
    setup_gpu(use_gpu=use_gpu, silent=False)
 | 
					    setup_gpu(use_gpu=use_gpu, silent=False)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    nlp = util.load_model(model)
 | 
					    nlp = util.load_model(model)
 | 
				
			||||||
| 
						 | 
					@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
 | 
				
			||||||
def warmup(
 | 
					def warmup(
 | 
				
			||||||
    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 | 
					    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 | 
				
			||||||
) -> numpy.ndarray:
 | 
					) -> numpy.ndarray:
 | 
				
			||||||
    docs = warmup_epochs * docs
 | 
					    docs = [doc.copy() for doc in docs * warmup_epochs]
 | 
				
			||||||
    return annotate(nlp, docs, batch_size)
 | 
					    return annotate(nlp, docs, batch_size)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,7 +7,14 @@ from wasabi import msg
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .. import about
 | 
					from .. import about
 | 
				
			||||||
from ..errors import OLD_MODEL_SHORTCUTS
 | 
					from ..errors import OLD_MODEL_SHORTCUTS
 | 
				
			||||||
from ..util import get_minor_version, is_package, is_prerelease_version, run_command
 | 
					from ..util import (
 | 
				
			||||||
 | 
					    get_minor_version,
 | 
				
			||||||
 | 
					    is_in_interactive,
 | 
				
			||||||
 | 
					    is_in_jupyter,
 | 
				
			||||||
 | 
					    is_package,
 | 
				
			||||||
 | 
					    is_prerelease_version,
 | 
				
			||||||
 | 
					    run_command,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 | 
					from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -77,6 +84,27 @@ def download(
 | 
				
			||||||
        "Download and installation successful",
 | 
					        "Download and installation successful",
 | 
				
			||||||
        f"You can now load the package via spacy.load('{model_name}')",
 | 
					        f"You can now load the package via spacy.load('{model_name}')",
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
 | 
					    if is_in_jupyter():
 | 
				
			||||||
 | 
					        reload_deps_msg = (
 | 
				
			||||||
 | 
					            "If you are in a Jupyter or Colab notebook, you may need to "
 | 
				
			||||||
 | 
					            "restart Python in order to load all the package's dependencies. "
 | 
				
			||||||
 | 
					            "You can do this by selecting the 'Restart kernel' or 'Restart "
 | 
				
			||||||
 | 
					            "runtime' option."
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        msg.warn(
 | 
				
			||||||
 | 
					            "Restart to reload dependencies",
 | 
				
			||||||
 | 
					            reload_deps_msg,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    elif is_in_interactive():
 | 
				
			||||||
 | 
					        reload_deps_msg = (
 | 
				
			||||||
 | 
					            "If you are in an interactive Python session, you may need to "
 | 
				
			||||||
 | 
					            "exit and restart Python to load all the package's dependencies. "
 | 
				
			||||||
 | 
					            "You can exit with Ctrl-D (or Ctrl-Z and Enter on Windows)."
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        msg.warn(
 | 
				
			||||||
 | 
					            "Restart to reload dependencies",
 | 
				
			||||||
 | 
					            reload_deps_msg,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
 | 
					def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,5 +1,7 @@
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
import shutil
 | 
					import shutil
 | 
				
			||||||
 | 
					import subprocess
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
from collections import defaultdict
 | 
					from collections import defaultdict
 | 
				
			||||||
from pathlib import Path
 | 
					from pathlib import Path
 | 
				
			||||||
| 
						 | 
					@ -11,6 +13,7 @@ from thinc.api import Config
 | 
				
			||||||
from wasabi import MarkdownRenderer, Printer, get_raw_input
 | 
					from wasabi import MarkdownRenderer, Printer, get_raw_input
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .. import about, util
 | 
					from .. import about, util
 | 
				
			||||||
 | 
					from ..compat import importlib_metadata
 | 
				
			||||||
from ..schemas import ModelMetaSchema, validate
 | 
					from ..schemas import ModelMetaSchema, validate
 | 
				
			||||||
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
 | 
					from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -35,7 +38,7 @@ def package_cli(
 | 
				
			||||||
    specified output directory, and the data will be copied over. If
 | 
					    specified output directory, and the data will be copied over. If
 | 
				
			||||||
    --create-meta is set and a meta.json already exists in the output directory,
 | 
					    --create-meta is set and a meta.json already exists in the output directory,
 | 
				
			||||||
    the existing values will be used as the defaults in the command-line prompt.
 | 
					    the existing values will be used as the defaults in the command-line prompt.
 | 
				
			||||||
    After packaging, "python setup.py sdist" is run in the package directory,
 | 
					    After packaging, "python -m build --sdist" is run in the package directory,
 | 
				
			||||||
    which will create a .tar.gz archive that can be installed via "pip install".
 | 
					    which will create a .tar.gz archive that can be installed via "pip install".
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    If additional code files are provided (e.g. Python files containing custom
 | 
					    If additional code files are provided (e.g. Python files containing custom
 | 
				
			||||||
| 
						 | 
					@ -78,9 +81,17 @@ def package(
 | 
				
			||||||
    input_path = util.ensure_path(input_dir)
 | 
					    input_path = util.ensure_path(input_dir)
 | 
				
			||||||
    output_path = util.ensure_path(output_dir)
 | 
					    output_path = util.ensure_path(output_dir)
 | 
				
			||||||
    meta_path = util.ensure_path(meta_path)
 | 
					    meta_path = util.ensure_path(meta_path)
 | 
				
			||||||
    if create_wheel and not has_wheel():
 | 
					    if create_wheel and not has_wheel() and not has_build():
 | 
				
			||||||
        err = "Generating a binary .whl file requires wheel to be installed"
 | 
					        err = (
 | 
				
			||||||
        msg.fail(err, "pip install wheel", exits=1)
 | 
					            "Generating wheels requires 'build' or 'wheel' (deprecated) to be installed"
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        msg.fail(err, "pip install build", exits=1)
 | 
				
			||||||
 | 
					    if not has_build():
 | 
				
			||||||
 | 
					        msg.warn(
 | 
				
			||||||
 | 
					            "Generating packages without the 'build' package is deprecated and "
 | 
				
			||||||
 | 
					            "will not be supported in the future. To install 'build': pip "
 | 
				
			||||||
 | 
					            "install build"
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
    if not input_path or not input_path.exists():
 | 
					    if not input_path or not input_path.exists():
 | 
				
			||||||
        msg.fail("Can't locate pipeline data", input_path, exits=1)
 | 
					        msg.fail("Can't locate pipeline data", input_path, exits=1)
 | 
				
			||||||
    if not output_path or not output_path.exists():
 | 
					    if not output_path or not output_path.exists():
 | 
				
			||||||
| 
						 | 
					@ -184,12 +195,37 @@ def package(
 | 
				
			||||||
    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
 | 
					    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
 | 
				
			||||||
    if create_sdist:
 | 
					    if create_sdist:
 | 
				
			||||||
        with util.working_dir(main_path):
 | 
					        with util.working_dir(main_path):
 | 
				
			||||||
 | 
					            # run directly, since util.run_command is not designed to continue
 | 
				
			||||||
 | 
					            # after a command fails
 | 
				
			||||||
 | 
					            ret = subprocess.run(
 | 
				
			||||||
 | 
					                [sys.executable, "-m", "build", ".", "--sdist"],
 | 
				
			||||||
 | 
					                env=os.environ.copy(),
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            if ret.returncode != 0:
 | 
				
			||||||
 | 
					                msg.warn(
 | 
				
			||||||
 | 
					                    "Creating sdist with 'python -m build' failed. Falling "
 | 
				
			||||||
 | 
					                    "back to deprecated use of 'python setup.py sdist'"
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
                util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
 | 
					                util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
 | 
				
			||||||
        zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
 | 
					        zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
 | 
				
			||||||
        msg.good(f"Successfully created zipped Python package", zip_file)
 | 
					        msg.good(f"Successfully created zipped Python package", zip_file)
 | 
				
			||||||
    if create_wheel:
 | 
					    if create_wheel:
 | 
				
			||||||
        with util.working_dir(main_path):
 | 
					        with util.working_dir(main_path):
 | 
				
			||||||
            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
 | 
					            # run directly, since util.run_command is not designed to continue
 | 
				
			||||||
 | 
					            # after a command fails
 | 
				
			||||||
 | 
					            ret = subprocess.run(
 | 
				
			||||||
 | 
					                [sys.executable, "-m", "build", ".", "--wheel"],
 | 
				
			||||||
 | 
					                env=os.environ.copy(),
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            if ret.returncode != 0:
 | 
				
			||||||
 | 
					                msg.warn(
 | 
				
			||||||
 | 
					                    "Creating wheel with 'python -m build' failed. Falling "
 | 
				
			||||||
 | 
					                    "back to deprecated use of 'wheel' with "
 | 
				
			||||||
 | 
					                    "'python setup.py bdist_wheel'"
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					                util.run_command(
 | 
				
			||||||
 | 
					                    [sys.executable, "setup.py", "bdist_wheel"], capture=False
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
        wheel_name_squashed = re.sub("_+", "_", model_name_v)
 | 
					        wheel_name_squashed = re.sub("_+", "_", model_name_v)
 | 
				
			||||||
        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
 | 
					        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
 | 
				
			||||||
        msg.good(f"Successfully created binary wheel", wheel)
 | 
					        msg.good(f"Successfully created binary wheel", wheel)
 | 
				
			||||||
| 
						 | 
					@ -209,6 +245,17 @@ def has_wheel() -> bool:
 | 
				
			||||||
        return False
 | 
					        return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def has_build() -> bool:
 | 
				
			||||||
 | 
					    # it's very likely that there is a local directory named build/ (especially
 | 
				
			||||||
 | 
					    # in an editable install), so an import check is not sufficient; instead
 | 
				
			||||||
 | 
					    # check that there is a package version
 | 
				
			||||||
 | 
					    try:
 | 
				
			||||||
 | 
					        importlib_metadata.version("build")
 | 
				
			||||||
 | 
					        return True
 | 
				
			||||||
 | 
					    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
 | 
				
			||||||
 | 
					        return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_third_party_dependencies(
 | 
					def get_third_party_dependencies(
 | 
				
			||||||
    config: Config, exclude: List[str] = util.SimpleFrozenList()
 | 
					    config: Config, exclude: List[str] = util.SimpleFrozenList()
 | 
				
			||||||
) -> List[str]:
 | 
					) -> List[str]:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										0
									
								
								spacy/cli/project/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										1
									
								
								spacy/cli/project/assets.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					from weasel.cli.assets import *
 | 
				
			||||||
							
								
								
									
										1
									
								
								spacy/cli/project/clone.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					from weasel.cli.clone import *
 | 
				
			||||||
							
								
								
									
										1
									
								
								spacy/cli/project/document.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					from weasel.cli.document import *
 | 
				
			||||||
							
								
								
									
										1
									
								
								spacy/cli/project/dvc.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					from weasel.cli.dvc import *
 | 
				
			||||||
							
								
								
									
										1
									
								
								spacy/cli/project/pull.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					from weasel.cli.pull import *
 | 
				
			||||||
							
								
								
									
										1
									
								
								spacy/cli/project/push.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					from weasel.cli.push import *
 | 
				
			||||||
							
								
								
									
										1
									
								
								spacy/cli/project/remote_storage.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					from weasel.cli.remote_storage import *
 | 
				
			||||||
							
								
								
									
										1
									
								
								spacy/cli/project/run.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					from weasel.cli.run import *
 | 
				
			||||||
| 
						 | 
					@ -271,8 +271,9 @@ grad_factor = 1.0
 | 
				
			||||||
@layers = "reduce_mean.v1"
 | 
					@layers = "reduce_mean.v1"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[components.textcat.model.linear_model]
 | 
					[components.textcat.model.linear_model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = true
 | 
					exclusive_classes = true
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -308,8 +309,9 @@ grad_factor = 1.0
 | 
				
			||||||
@layers = "reduce_mean.v1"
 | 
					@layers = "reduce_mean.v1"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[components.textcat_multilabel.model.linear_model]
 | 
					[components.textcat_multilabel.model.linear_model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = false
 | 
					exclusive_classes = false
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -542,14 +544,15 @@ nO = null
 | 
				
			||||||
width = ${components.tok2vec.model.encode.width}
 | 
					width = ${components.tok2vec.model.encode.width}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[components.textcat.model.linear_model]
 | 
					[components.textcat.model.linear_model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = true
 | 
					exclusive_classes = true
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
{% else -%}
 | 
					{% else -%}
 | 
				
			||||||
[components.textcat.model]
 | 
					[components.textcat.model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = true
 | 
					exclusive_classes = true
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
| 
						 | 
					@ -570,15 +573,17 @@ nO = null
 | 
				
			||||||
width = ${components.tok2vec.model.encode.width}
 | 
					width = ${components.tok2vec.model.encode.width}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[components.textcat_multilabel.model.linear_model]
 | 
					[components.textcat_multilabel.model.linear_model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = false
 | 
					exclusive_classes = false
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
{% else -%}
 | 
					{% else -%}
 | 
				
			||||||
[components.textcat_multilabel.model]
 | 
					[components.textcat_multilabel.model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = false
 | 
					exclusive_classes = false
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
{%- endif %}
 | 
					{%- endif %}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -142,7 +142,25 @@ class SpanRenderer:
 | 
				
			||||||
        spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
 | 
					        spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
 | 
				
			||||||
        title (str / None): Document title set in Doc.user_data['title'].
 | 
					        title (str / None): Document title set in Doc.user_data['title'].
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        per_token_info = []
 | 
					        per_token_info = self._assemble_per_token_info(tokens, spans)
 | 
				
			||||||
 | 
					        markup = self._render_markup(per_token_info)
 | 
				
			||||||
 | 
					        markup = TPL_SPANS.format(content=markup, dir=self.direction)
 | 
				
			||||||
 | 
					        if title:
 | 
				
			||||||
 | 
					            markup = TPL_TITLE.format(title=title) + markup
 | 
				
			||||||
 | 
					        return markup
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @staticmethod
 | 
				
			||||||
 | 
					    def _assemble_per_token_info(
 | 
				
			||||||
 | 
					        tokens: List[str], spans: List[Dict[str, Any]]
 | 
				
			||||||
 | 
					    ) -> List[Dict[str, List[Dict[str, Any]]]]:
 | 
				
			||||||
 | 
					        """Assembles token info used to generate markup in render_spans().
 | 
				
			||||||
 | 
					        tokens (List[str]): Tokens in text.
 | 
				
			||||||
 | 
					        spans (List[Dict[str, Any]]): Spans in text.
 | 
				
			||||||
 | 
					        RETURNS (List[Dict[str, List[Dict, str, Any]]]): Per token info needed to render HTML markup for given tokens
 | 
				
			||||||
 | 
					            and spans.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        per_token_info: List[Dict[str, List[Dict[str, Any]]]] = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # we must sort so that we can correctly describe when spans need to "stack"
 | 
					        # we must sort so that we can correctly describe when spans need to "stack"
 | 
				
			||||||
        # which is determined by their start token, then span length (longer spans on top),
 | 
					        # which is determined by their start token, then span length (longer spans on top),
 | 
				
			||||||
        # then break any remaining ties with the span label
 | 
					        # then break any remaining ties with the span label
 | 
				
			||||||
| 
						 | 
					@ -154,21 +172,22 @@ class SpanRenderer:
 | 
				
			||||||
                s["label"],
 | 
					                s["label"],
 | 
				
			||||||
            ),
 | 
					            ),
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for s in spans:
 | 
					        for s in spans:
 | 
				
			||||||
            # this is the vertical 'slot' that the span will be rendered in
 | 
					            # this is the vertical 'slot' that the span will be rendered in
 | 
				
			||||||
            # vertical_position = span_label_offset + (offset_step * (slot - 1))
 | 
					            # vertical_position = span_label_offset + (offset_step * (slot - 1))
 | 
				
			||||||
            s["render_slot"] = 0
 | 
					            s["render_slot"] = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for idx, token in enumerate(tokens):
 | 
					        for idx, token in enumerate(tokens):
 | 
				
			||||||
            # Identify if a token belongs to a Span (and which) and if it's a
 | 
					            # Identify if a token belongs to a Span (and which) and if it's a
 | 
				
			||||||
            # start token of said Span. We'll use this for the final HTML render
 | 
					            # start token of said Span. We'll use this for the final HTML render
 | 
				
			||||||
            token_markup: Dict[str, Any] = {}
 | 
					            token_markup: Dict[str, Any] = {}
 | 
				
			||||||
            token_markup["text"] = token
 | 
					            token_markup["text"] = token
 | 
				
			||||||
            concurrent_spans = 0
 | 
					            intersecting_spans: List[Dict[str, Any]] = []
 | 
				
			||||||
            entities = []
 | 
					            entities = []
 | 
				
			||||||
            for span in spans:
 | 
					            for span in spans:
 | 
				
			||||||
                ent = {}
 | 
					                ent = {}
 | 
				
			||||||
                if span["start_token"] <= idx < span["end_token"]:
 | 
					                if span["start_token"] <= idx < span["end_token"]:
 | 
				
			||||||
                    concurrent_spans += 1
 | 
					 | 
				
			||||||
                    span_start = idx == span["start_token"]
 | 
					                    span_start = idx == span["start_token"]
 | 
				
			||||||
                    ent["label"] = span["label"]
 | 
					                    ent["label"] = span["label"]
 | 
				
			||||||
                    ent["is_start"] = span_start
 | 
					                    ent["is_start"] = span_start
 | 
				
			||||||
| 
						 | 
					@ -176,7 +195,12 @@ class SpanRenderer:
 | 
				
			||||||
                        # When the span starts, we need to know how many other
 | 
					                        # When the span starts, we need to know how many other
 | 
				
			||||||
                        # spans are on the 'span stack' and will be rendered.
 | 
					                        # spans are on the 'span stack' and will be rendered.
 | 
				
			||||||
                        # This value becomes the vertical render slot for this entire span
 | 
					                        # This value becomes the vertical render slot for this entire span
 | 
				
			||||||
                        span["render_slot"] = concurrent_spans
 | 
					                        span["render_slot"] = (
 | 
				
			||||||
 | 
					                            intersecting_spans[-1]["render_slot"]
 | 
				
			||||||
 | 
					                            if len(intersecting_spans)
 | 
				
			||||||
 | 
					                            else 0
 | 
				
			||||||
 | 
					                        ) + 1
 | 
				
			||||||
 | 
					                    intersecting_spans.append(span)
 | 
				
			||||||
                    ent["render_slot"] = span["render_slot"]
 | 
					                    ent["render_slot"] = span["render_slot"]
 | 
				
			||||||
                    kb_id = span.get("kb_id", "")
 | 
					                    kb_id = span.get("kb_id", "")
 | 
				
			||||||
                    kb_url = span.get("kb_url", "#")
 | 
					                    kb_url = span.get("kb_url", "#")
 | 
				
			||||||
| 
						 | 
					@ -193,11 +217,8 @@ class SpanRenderer:
 | 
				
			||||||
                    span["render_slot"] = 0
 | 
					                    span["render_slot"] = 0
 | 
				
			||||||
            token_markup["entities"] = entities
 | 
					            token_markup["entities"] = entities
 | 
				
			||||||
            per_token_info.append(token_markup)
 | 
					            per_token_info.append(token_markup)
 | 
				
			||||||
        markup = self._render_markup(per_token_info)
 | 
					
 | 
				
			||||||
        markup = TPL_SPANS.format(content=markup, dir=self.direction)
 | 
					        return per_token_info
 | 
				
			||||||
        if title:
 | 
					 | 
				
			||||||
            markup = TPL_TITLE.format(title=title) + markup
 | 
					 | 
				
			||||||
        return markup
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
 | 
					    def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
 | 
				
			||||||
        """Render the markup from per-token information"""
 | 
					        """Render the markup from per-token information"""
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -227,7 +227,6 @@ class Errors(metaclass=ErrorsWithCodes):
 | 
				
			||||||
    E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
 | 
					    E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
 | 
				
			||||||
            "This usually happens when spaCy calls `nlp.{method}` with a custom "
 | 
					            "This usually happens when spaCy calls `nlp.{method}` with a custom "
 | 
				
			||||||
            "component name that's not registered on the current language class. "
 | 
					            "component name that's not registered on the current language class. "
 | 
				
			||||||
            "If you're using a Transformer, make sure to install 'spacy-transformers'. "
 | 
					 | 
				
			||||||
            "If you're using a custom component, make sure you've added the "
 | 
					            "If you're using a custom component, make sure you've added the "
 | 
				
			||||||
            "decorator `@Language.component` (for function components) or "
 | 
					            "decorator `@Language.component` (for function components) or "
 | 
				
			||||||
            "`@Language.factory` (for class components).\n\nAvailable "
 | 
					            "`@Language.factory` (for class components).\n\nAvailable "
 | 
				
			||||||
| 
						 | 
					@ -984,6 +983,10 @@ class Errors(metaclass=ErrorsWithCodes):
 | 
				
			||||||
             "predicted docs when training {component}.")
 | 
					             "predicted docs when training {component}.")
 | 
				
			||||||
    E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
 | 
					    E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
 | 
				
			||||||
             "but only callbacks with one or three parameters are supported")
 | 
					             "but only callbacks with one or three parameters are supported")
 | 
				
			||||||
 | 
					    E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
 | 
				
			||||||
 | 
					    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
 | 
				
			||||||
 | 
					             "reduction. Please enable one of `use_reduce_first`, "
 | 
				
			||||||
 | 
					             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Deprecated model shortcuts, only used in errors and warnings
 | 
					# Deprecated model shortcuts, only used in errors and warnings
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,3 +1,11 @@
 | 
				
			||||||
from .candidate import Candidate, get_candidates, get_candidates_batch
 | 
					from .candidate import Candidate, get_candidates, get_candidates_batch
 | 
				
			||||||
from .kb import KnowledgeBase
 | 
					from .kb import KnowledgeBase
 | 
				
			||||||
from .kb_in_memory import InMemoryLookupKB
 | 
					from .kb_in_memory import InMemoryLookupKB
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__all__ = [
 | 
				
			||||||
 | 
					    "Candidate",
 | 
				
			||||||
 | 
					    "KnowledgeBase",
 | 
				
			||||||
 | 
					    "InMemoryLookupKB",
 | 
				
			||||||
 | 
					    "get_candidates",
 | 
				
			||||||
 | 
					    "get_candidates_batch",
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -6,7 +6,8 @@ _num_words = [
 | 
				
			||||||
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
 | 
					    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
 | 
				
			||||||
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
 | 
					    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
 | 
				
			||||||
    "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
 | 
					    "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
 | 
				
			||||||
    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
 | 
					    "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion",
 | 
				
			||||||
 | 
					    "septillion", "octillion", "nonillion", "decillion", "gajillion", "bazillion"
 | 
				
			||||||
]
 | 
					]
 | 
				
			||||||
_ordinal_words = [
 | 
					_ordinal_words = [
 | 
				
			||||||
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
 | 
					    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
 | 
				
			||||||
| 
						 | 
					@ -14,7 +15,8 @@ _ordinal_words = [
 | 
				
			||||||
    "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
 | 
					    "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
 | 
				
			||||||
    "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
 | 
					    "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
 | 
				
			||||||
    "eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth",
 | 
					    "eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth",
 | 
				
			||||||
    "trillionth", "quadrillionth", "gajillionth", "bazillionth"
 | 
					    "trillionth", "quadrillionth", "quintillionth", "sextillionth", "septillionth",
 | 
				
			||||||
 | 
					    "octillionth", "nonillionth", "decillionth", "gajillionth", "bazillionth"
 | 
				
			||||||
]
 | 
					]
 | 
				
			||||||
# fmt: on
 | 
					# fmt: on
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										18
									
								
								spacy/lang/fo/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1,18 @@
 | 
				
			||||||
 | 
					from ...language import BaseDefaults, Language
 | 
				
			||||||
 | 
					from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 | 
				
			||||||
 | 
					from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class FaroeseDefaults(BaseDefaults):
 | 
				
			||||||
 | 
					    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 | 
				
			||||||
 | 
					    infixes = TOKENIZER_INFIXES
 | 
				
			||||||
 | 
					    suffixes = TOKENIZER_SUFFIXES
 | 
				
			||||||
 | 
					    prefixes = TOKENIZER_PREFIXES
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Faroese(Language):
 | 
				
			||||||
 | 
					    lang = "fo"
 | 
				
			||||||
 | 
					    Defaults = FaroeseDefaults
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__all__ = ["Faroese"]
 | 
				
			||||||
							
								
								
									
										90
									
								
								spacy/lang/fo/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1,90 @@
 | 
				
			||||||
 | 
					from ...symbols import ORTH
 | 
				
			||||||
 | 
					from ...util import update_exc
 | 
				
			||||||
 | 
					from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_exc = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					for orth in [
 | 
				
			||||||
 | 
					    "apr.",
 | 
				
			||||||
 | 
					    "aug.",
 | 
				
			||||||
 | 
					    "avgr.",
 | 
				
			||||||
 | 
					    "árg.",
 | 
				
			||||||
 | 
					    "ávís.",
 | 
				
			||||||
 | 
					    "beinl.",
 | 
				
			||||||
 | 
					    "blkv.",
 | 
				
			||||||
 | 
					    "blaðkv.",
 | 
				
			||||||
 | 
					    "blm.",
 | 
				
			||||||
 | 
					    "blaðm.",
 | 
				
			||||||
 | 
					    "bls.",
 | 
				
			||||||
 | 
					    "blstj.",
 | 
				
			||||||
 | 
					    "blaðstj.",
 | 
				
			||||||
 | 
					    "des.",
 | 
				
			||||||
 | 
					    "eint.",
 | 
				
			||||||
 | 
					    "febr.",
 | 
				
			||||||
 | 
					    "fyrrv.",
 | 
				
			||||||
 | 
					    "góðk.",
 | 
				
			||||||
 | 
					    "h.m.",
 | 
				
			||||||
 | 
					    "innt.",
 | 
				
			||||||
 | 
					    "jan.",
 | 
				
			||||||
 | 
					    "kl.",
 | 
				
			||||||
 | 
					    "m.a.",
 | 
				
			||||||
 | 
					    "mðr.",
 | 
				
			||||||
 | 
					    "mió.",
 | 
				
			||||||
 | 
					    "nr.",
 | 
				
			||||||
 | 
					    "nto.",
 | 
				
			||||||
 | 
					    "nov.",
 | 
				
			||||||
 | 
					    "nút.",
 | 
				
			||||||
 | 
					    "o.a.",
 | 
				
			||||||
 | 
					    "o.a.m.",
 | 
				
			||||||
 | 
					    "o.a.tíl.",
 | 
				
			||||||
 | 
					    "o.fl.",
 | 
				
			||||||
 | 
					    "ff.",
 | 
				
			||||||
 | 
					    "o.m.a.",
 | 
				
			||||||
 | 
					    "o.o.",
 | 
				
			||||||
 | 
					    "o.s.fr.",
 | 
				
			||||||
 | 
					    "o.tíl.",
 | 
				
			||||||
 | 
					    "o.ø.",
 | 
				
			||||||
 | 
					    "okt.",
 | 
				
			||||||
 | 
					    "omf.",
 | 
				
			||||||
 | 
					    "pst.",
 | 
				
			||||||
 | 
					    "ritstj.",
 | 
				
			||||||
 | 
					    "sbr.",
 | 
				
			||||||
 | 
					    "sms.",
 | 
				
			||||||
 | 
					    "smst.",
 | 
				
			||||||
 | 
					    "smb.",
 | 
				
			||||||
 | 
					    "sb.",
 | 
				
			||||||
 | 
					    "sbrt.",
 | 
				
			||||||
 | 
					    "sp.",
 | 
				
			||||||
 | 
					    "sept.",
 | 
				
			||||||
 | 
					    "spf.",
 | 
				
			||||||
 | 
					    "spsk.",
 | 
				
			||||||
 | 
					    "t.e.",
 | 
				
			||||||
 | 
					    "t.s.",
 | 
				
			||||||
 | 
					    "t.s.s.",
 | 
				
			||||||
 | 
					    "tlf.",
 | 
				
			||||||
 | 
					    "tel.",
 | 
				
			||||||
 | 
					    "tsk.",
 | 
				
			||||||
 | 
					    "t.o.v.",
 | 
				
			||||||
 | 
					    "t.d.",
 | 
				
			||||||
 | 
					    "uml.",
 | 
				
			||||||
 | 
					    "ums.",
 | 
				
			||||||
 | 
					    "uppl.",
 | 
				
			||||||
 | 
					    "upprfr.",
 | 
				
			||||||
 | 
					    "uppr.",
 | 
				
			||||||
 | 
					    "útg.",
 | 
				
			||||||
 | 
					    "útl.",
 | 
				
			||||||
 | 
					    "útr.",
 | 
				
			||||||
 | 
					    "vanl.",
 | 
				
			||||||
 | 
					    "v.",
 | 
				
			||||||
 | 
					    "v.h.",
 | 
				
			||||||
 | 
					    "v.ø.o.",
 | 
				
			||||||
 | 
					    "viðm.",
 | 
				
			||||||
 | 
					    "viðv.",
 | 
				
			||||||
 | 
					    "vm.",
 | 
				
			||||||
 | 
					    "v.m.",
 | 
				
			||||||
 | 
					]:
 | 
				
			||||||
 | 
					    _exc[orth] = [{ORTH: orth}]
 | 
				
			||||||
 | 
					    capitalized = orth.capitalize()
 | 
				
			||||||
 | 
					    _exc[capitalized] = [{ORTH: capitalized}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 | 
				
			||||||
							
								
								
									
										20
									
								
								spacy/lang/nn/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1,20 @@
 | 
				
			||||||
 | 
					from ...language import BaseDefaults, Language
 | 
				
			||||||
 | 
					from ..nb import SYNTAX_ITERATORS
 | 
				
			||||||
 | 
					from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 | 
				
			||||||
 | 
					from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class NorwegianNynorskDefaults(BaseDefaults):
 | 
				
			||||||
 | 
					    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
 | 
				
			||||||
 | 
					    prefixes = TOKENIZER_PREFIXES
 | 
				
			||||||
 | 
					    infixes = TOKENIZER_INFIXES
 | 
				
			||||||
 | 
					    suffixes = TOKENIZER_SUFFIXES
 | 
				
			||||||
 | 
					    syntax_iterators = SYNTAX_ITERATORS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class NorwegianNynorsk(Language):
 | 
				
			||||||
 | 
					    lang = "nn"
 | 
				
			||||||
 | 
					    Defaults = NorwegianNynorskDefaults
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__all__ = ["NorwegianNynorsk"]
 | 
				
			||||||
							
								
								
									
										15
									
								
								spacy/lang/nn/examples.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1,15 @@
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					Example sentences to test spaCy and its language models.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					>>> from spacy.lang.nn.examples import sentences
 | 
				
			||||||
 | 
					>>> docs = nlp.pipe(sentences)
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
 | 
				
			||||||
 | 
					sentences = [
 | 
				
			||||||
 | 
					    "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
 | 
				
			||||||
 | 
					    "Det er ein meir enn i same periode i fjor.",
 | 
				
			||||||
 | 
					    "Det har lava ned enorme snømengder i store delar av Europa den siste tida.",
 | 
				
			||||||
 | 
					    "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.",
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
							
								
								
									
										74
									
								
								spacy/lang/nn/punctuation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1,74 @@
 | 
				
			||||||
 | 
					from ..char_classes import (
 | 
				
			||||||
 | 
					    ALPHA,
 | 
				
			||||||
 | 
					    ALPHA_LOWER,
 | 
				
			||||||
 | 
					    ALPHA_UPPER,
 | 
				
			||||||
 | 
					    CONCAT_QUOTES,
 | 
				
			||||||
 | 
					    CURRENCY,
 | 
				
			||||||
 | 
					    LIST_CURRENCY,
 | 
				
			||||||
 | 
					    LIST_ELLIPSES,
 | 
				
			||||||
 | 
					    LIST_ICONS,
 | 
				
			||||||
 | 
					    LIST_PUNCT,
 | 
				
			||||||
 | 
					    LIST_QUOTES,
 | 
				
			||||||
 | 
					    PUNCT,
 | 
				
			||||||
 | 
					    UNITS,
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					from ..punctuation import TOKENIZER_SUFFIXES
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_quotes = CONCAT_QUOTES.replace("'", "")
 | 
				
			||||||
 | 
					_list_punct = [x for x in LIST_PUNCT if x != "#"]
 | 
				
			||||||
 | 
					_list_icons = [x for x in LIST_ICONS if x != "°"]
 | 
				
			||||||
 | 
					_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
 | 
				
			||||||
 | 
					_list_quotes = [x for x in LIST_QUOTES if x != "\\'"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_prefixes = (
 | 
				
			||||||
 | 
					    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
 | 
				
			||||||
 | 
					    + _list_punct
 | 
				
			||||||
 | 
					    + LIST_ELLIPSES
 | 
				
			||||||
 | 
					    + LIST_QUOTES
 | 
				
			||||||
 | 
					    + LIST_CURRENCY
 | 
				
			||||||
 | 
					    + LIST_ICONS
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_infixes = (
 | 
				
			||||||
 | 
					    LIST_ELLIPSES
 | 
				
			||||||
 | 
					    + _list_icons
 | 
				
			||||||
 | 
					    + [
 | 
				
			||||||
 | 
					        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
 | 
				
			||||||
 | 
					        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
 | 
				
			||||||
 | 
					        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
 | 
				
			||||||
 | 
					        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
 | 
				
			||||||
 | 
					        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
 | 
				
			||||||
 | 
					        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_suffixes = (
 | 
				
			||||||
 | 
					    LIST_PUNCT
 | 
				
			||||||
 | 
					    + LIST_ELLIPSES
 | 
				
			||||||
 | 
					    + _list_quotes
 | 
				
			||||||
 | 
					    + _list_icons
 | 
				
			||||||
 | 
					    + ["—", "–"]
 | 
				
			||||||
 | 
					    + [
 | 
				
			||||||
 | 
					        r"(?<=[0-9])\+",
 | 
				
			||||||
 | 
					        r"(?<=°[FfCcKk])\.",
 | 
				
			||||||
 | 
					        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
 | 
				
			||||||
 | 
					        r"(?<=[0-9])(?:{u})".format(u=UNITS),
 | 
				
			||||||
 | 
					        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
 | 
				
			||||||
 | 
					            al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
 | 
				
			||||||
 | 
					        ),
 | 
				
			||||||
 | 
					        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					    + [r"(?<=[^sSxXzZ])'"]
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					_suffixes += [
 | 
				
			||||||
 | 
					    suffix
 | 
				
			||||||
 | 
					    for suffix in TOKENIZER_SUFFIXES
 | 
				
			||||||
 | 
					    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					TOKENIZER_PREFIXES = _prefixes
 | 
				
			||||||
 | 
					TOKENIZER_INFIXES = _infixes
 | 
				
			||||||
 | 
					TOKENIZER_SUFFIXES = _suffixes
 | 
				
			||||||
							
								
								
									
										228
									
								
								spacy/lang/nn/tokenizer_exceptions.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1,228 @@
 | 
				
			||||||
 | 
					from ...symbols import NORM, ORTH
 | 
				
			||||||
 | 
					from ...util import update_exc
 | 
				
			||||||
 | 
					from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_exc = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					for exc_data in [
 | 
				
			||||||
 | 
					    {ORTH: "jan.", NORM: "januar"},
 | 
				
			||||||
 | 
					    {ORTH: "feb.", NORM: "februar"},
 | 
				
			||||||
 | 
					    {ORTH: "mar.", NORM: "mars"},
 | 
				
			||||||
 | 
					    {ORTH: "apr.", NORM: "april"},
 | 
				
			||||||
 | 
					    {ORTH: "jun.", NORM: "juni"},
 | 
				
			||||||
 | 
					    # note: "jul." is in the simple list below without a NORM exception
 | 
				
			||||||
 | 
					    {ORTH: "aug.", NORM: "august"},
 | 
				
			||||||
 | 
					    {ORTH: "sep.", NORM: "september"},
 | 
				
			||||||
 | 
					    {ORTH: "okt.", NORM: "oktober"},
 | 
				
			||||||
 | 
					    {ORTH: "nov.", NORM: "november"},
 | 
				
			||||||
 | 
					    {ORTH: "des.", NORM: "desember"},
 | 
				
			||||||
 | 
					]:
 | 
				
			||||||
 | 
					    _exc[exc_data[ORTH]] = [exc_data]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					for orth in [
 | 
				
			||||||
 | 
					    "Ap.",
 | 
				
			||||||
 | 
					    "Aq.",
 | 
				
			||||||
 | 
					    "Ca.",
 | 
				
			||||||
 | 
					    "Chr.",
 | 
				
			||||||
 | 
					    "Co.",
 | 
				
			||||||
 | 
					    "Dr.",
 | 
				
			||||||
 | 
					    "F.eks.",
 | 
				
			||||||
 | 
					    "Fr.p.",
 | 
				
			||||||
 | 
					    "Frp.",
 | 
				
			||||||
 | 
					    "Grl.",
 | 
				
			||||||
 | 
					    "Kr.",
 | 
				
			||||||
 | 
					    "Kr.F.",
 | 
				
			||||||
 | 
					    "Kr.F.s",
 | 
				
			||||||
 | 
					    "Mr.",
 | 
				
			||||||
 | 
					    "Mrs.",
 | 
				
			||||||
 | 
					    "Pb.",
 | 
				
			||||||
 | 
					    "Pr.",
 | 
				
			||||||
 | 
					    "Sp.",
 | 
				
			||||||
 | 
					    "St.",
 | 
				
			||||||
 | 
					    "a.m.",
 | 
				
			||||||
 | 
					    "ad.",
 | 
				
			||||||
 | 
					    "adm.dir.",
 | 
				
			||||||
 | 
					    "adr.",
 | 
				
			||||||
 | 
					    "b.c.",
 | 
				
			||||||
 | 
					    "bl.a.",
 | 
				
			||||||
 | 
					    "bla.",
 | 
				
			||||||
 | 
					    "bm.",
 | 
				
			||||||
 | 
					    "bnr.",
 | 
				
			||||||
 | 
					    "bto.",
 | 
				
			||||||
 | 
					    "c.c.",
 | 
				
			||||||
 | 
					    "ca.",
 | 
				
			||||||
 | 
					    "cand.mag.",
 | 
				
			||||||
 | 
					    "co.",
 | 
				
			||||||
 | 
					    "d.d.",
 | 
				
			||||||
 | 
					    "d.m.",
 | 
				
			||||||
 | 
					    "d.y.",
 | 
				
			||||||
 | 
					    "dept.",
 | 
				
			||||||
 | 
					    "dr.",
 | 
				
			||||||
 | 
					    "dr.med.",
 | 
				
			||||||
 | 
					    "dr.philos.",
 | 
				
			||||||
 | 
					    "dr.psychol.",
 | 
				
			||||||
 | 
					    "dss.",
 | 
				
			||||||
 | 
					    "dvs.",
 | 
				
			||||||
 | 
					    "e.Kr.",
 | 
				
			||||||
 | 
					    "e.l.",
 | 
				
			||||||
 | 
					    "eg.",
 | 
				
			||||||
 | 
					    "eig.",
 | 
				
			||||||
 | 
					    "ekskl.",
 | 
				
			||||||
 | 
					    "el.",
 | 
				
			||||||
 | 
					    "et.",
 | 
				
			||||||
 | 
					    "etc.",
 | 
				
			||||||
 | 
					    "etg.",
 | 
				
			||||||
 | 
					    "ev.",
 | 
				
			||||||
 | 
					    "evt.",
 | 
				
			||||||
 | 
					    "f.",
 | 
				
			||||||
 | 
					    "f.Kr.",
 | 
				
			||||||
 | 
					    "f.eks.",
 | 
				
			||||||
 | 
					    "f.o.m.",
 | 
				
			||||||
 | 
					    "fhv.",
 | 
				
			||||||
 | 
					    "fk.",
 | 
				
			||||||
 | 
					    "foreg.",
 | 
				
			||||||
 | 
					    "fork.",
 | 
				
			||||||
 | 
					    "fv.",
 | 
				
			||||||
 | 
					    "fvt.",
 | 
				
			||||||
 | 
					    "g.",
 | 
				
			||||||
 | 
					    "gl.",
 | 
				
			||||||
 | 
					    "gno.",
 | 
				
			||||||
 | 
					    "gnr.",
 | 
				
			||||||
 | 
					    "grl.",
 | 
				
			||||||
 | 
					    "gt.",
 | 
				
			||||||
 | 
					    "h.r.adv.",
 | 
				
			||||||
 | 
					    "hhv.",
 | 
				
			||||||
 | 
					    "hoh.",
 | 
				
			||||||
 | 
					    "hr.",
 | 
				
			||||||
 | 
					    "ifb.",
 | 
				
			||||||
 | 
					    "ifm.",
 | 
				
			||||||
 | 
					    "iht.",
 | 
				
			||||||
 | 
					    "inkl.",
 | 
				
			||||||
 | 
					    "istf.",
 | 
				
			||||||
 | 
					    "jf.",
 | 
				
			||||||
 | 
					    "jr.",
 | 
				
			||||||
 | 
					    "jul.",
 | 
				
			||||||
 | 
					    "juris.",
 | 
				
			||||||
 | 
					    "kfr.",
 | 
				
			||||||
 | 
					    "kgl.",
 | 
				
			||||||
 | 
					    "kgl.res.",
 | 
				
			||||||
 | 
					    "kl.",
 | 
				
			||||||
 | 
					    "komm.",
 | 
				
			||||||
 | 
					    "kr.",
 | 
				
			||||||
 | 
					    "kst.",
 | 
				
			||||||
 | 
					    "lat.",
 | 
				
			||||||
 | 
					    "lø.",
 | 
				
			||||||
 | 
					    "m.a.",
 | 
				
			||||||
 | 
					    "m.a.o.",
 | 
				
			||||||
 | 
					    "m.fl.",
 | 
				
			||||||
 | 
					    "m.m.",
 | 
				
			||||||
 | 
					    "m.v.",
 | 
				
			||||||
 | 
					    "ma.",
 | 
				
			||||||
 | 
					    "mag.art.",
 | 
				
			||||||
 | 
					    "md.",
 | 
				
			||||||
 | 
					    "mfl.",
 | 
				
			||||||
 | 
					    "mht.",
 | 
				
			||||||
 | 
					    "mill.",
 | 
				
			||||||
 | 
					    "min.",
 | 
				
			||||||
 | 
					    "mnd.",
 | 
				
			||||||
 | 
					    "moh.",
 | 
				
			||||||
 | 
					    "mrd.",
 | 
				
			||||||
 | 
					    "muh.",
 | 
				
			||||||
 | 
					    "mv.",
 | 
				
			||||||
 | 
					    "mva.",
 | 
				
			||||||
 | 
					    "n.å.",
 | 
				
			||||||
 | 
					    "ndf.",
 | 
				
			||||||
 | 
					    "nr.",
 | 
				
			||||||
 | 
					    "nto.",
 | 
				
			||||||
 | 
					    "nyno.",
 | 
				
			||||||
 | 
					    "o.a.",
 | 
				
			||||||
 | 
					    "o.l.",
 | 
				
			||||||
 | 
					    "obl.",
 | 
				
			||||||
 | 
					    "off.",
 | 
				
			||||||
 | 
					    "ofl.",
 | 
				
			||||||
 | 
					    "on.",
 | 
				
			||||||
 | 
					    "op.",
 | 
				
			||||||
 | 
					    "org.",
 | 
				
			||||||
 | 
					    "osv.",
 | 
				
			||||||
 | 
					    "ovf.",
 | 
				
			||||||
 | 
					    "p.",
 | 
				
			||||||
 | 
					    "p.a.",
 | 
				
			||||||
 | 
					    "p.g.a.",
 | 
				
			||||||
 | 
					    "p.m.",
 | 
				
			||||||
 | 
					    "p.t.",
 | 
				
			||||||
 | 
					    "pga.",
 | 
				
			||||||
 | 
					    "ph.d.",
 | 
				
			||||||
 | 
					    "pkt.",
 | 
				
			||||||
 | 
					    "pr.",
 | 
				
			||||||
 | 
					    "pst.",
 | 
				
			||||||
 | 
					    "pt.",
 | 
				
			||||||
 | 
					    "red.anm.",
 | 
				
			||||||
 | 
					    "ref.",
 | 
				
			||||||
 | 
					    "res.",
 | 
				
			||||||
 | 
					    "res.kap.",
 | 
				
			||||||
 | 
					    "resp.",
 | 
				
			||||||
 | 
					    "rv.",
 | 
				
			||||||
 | 
					    "s.",
 | 
				
			||||||
 | 
					    "s.d.",
 | 
				
			||||||
 | 
					    "s.k.",
 | 
				
			||||||
 | 
					    "s.u.",
 | 
				
			||||||
 | 
					    "s.å.",
 | 
				
			||||||
 | 
					    "sen.",
 | 
				
			||||||
 | 
					    "sep.",
 | 
				
			||||||
 | 
					    "siviling.",
 | 
				
			||||||
 | 
					    "sms.",
 | 
				
			||||||
 | 
					    "snr.",
 | 
				
			||||||
 | 
					    "spm.",
 | 
				
			||||||
 | 
					    "sr.",
 | 
				
			||||||
 | 
					    "sst.",
 | 
				
			||||||
 | 
					    "st.",
 | 
				
			||||||
 | 
					    "st.meld.",
 | 
				
			||||||
 | 
					    "st.prp.",
 | 
				
			||||||
 | 
					    "stip.",
 | 
				
			||||||
 | 
					    "stk.",
 | 
				
			||||||
 | 
					    "stud.",
 | 
				
			||||||
 | 
					    "sv.",
 | 
				
			||||||
 | 
					    "såk.",
 | 
				
			||||||
 | 
					    "sø.",
 | 
				
			||||||
 | 
					    "t.d.",
 | 
				
			||||||
 | 
					    "t.h.",
 | 
				
			||||||
 | 
					    "t.o.m.",
 | 
				
			||||||
 | 
					    "t.v.",
 | 
				
			||||||
 | 
					    "temp.",
 | 
				
			||||||
 | 
					    "ti.",
 | 
				
			||||||
 | 
					    "tils.",
 | 
				
			||||||
 | 
					    "tilsv.",
 | 
				
			||||||
 | 
					    "tl;dr",
 | 
				
			||||||
 | 
					    "tlf.",
 | 
				
			||||||
 | 
					    "to.",
 | 
				
			||||||
 | 
					    "ult.",
 | 
				
			||||||
 | 
					    "utg.",
 | 
				
			||||||
 | 
					    "v.",
 | 
				
			||||||
 | 
					    "vedk.",
 | 
				
			||||||
 | 
					    "vedr.",
 | 
				
			||||||
 | 
					    "vg.",
 | 
				
			||||||
 | 
					    "vgs.",
 | 
				
			||||||
 | 
					    "vha.",
 | 
				
			||||||
 | 
					    "vit.ass.",
 | 
				
			||||||
 | 
					    "vn.",
 | 
				
			||||||
 | 
					    "vol.",
 | 
				
			||||||
 | 
					    "vs.",
 | 
				
			||||||
 | 
					    "vsa.",
 | 
				
			||||||
 | 
					    "§§",
 | 
				
			||||||
 | 
					    "©NTB",
 | 
				
			||||||
 | 
					    "årg.",
 | 
				
			||||||
 | 
					    "årh.",
 | 
				
			||||||
 | 
					]:
 | 
				
			||||||
 | 
					    _exc[orth] = [{ORTH: orth}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Dates
 | 
				
			||||||
 | 
					for h in range(1, 31 + 1):
 | 
				
			||||||
 | 
					    for period in ["."]:
 | 
				
			||||||
 | 
					        _exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
 | 
				
			||||||
 | 
					_exc.update(_custom_base_exc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 | 
				
			||||||
| 
						 | 
					@ -1683,6 +1683,12 @@ class Language:
 | 
				
			||||||
        for proc in procs:
 | 
					        for proc in procs:
 | 
				
			||||||
            proc.start()
 | 
					            proc.start()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Close writing-end of channels. This is needed to avoid that reading
 | 
				
			||||||
 | 
					        # from the channel blocks indefinitely when the worker closes the
 | 
				
			||||||
 | 
					        # channel.
 | 
				
			||||||
 | 
					        for tx in bytedocs_send_ch:
 | 
				
			||||||
 | 
					            tx.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Cycle channels not to break the order of docs.
 | 
					        # Cycle channels not to break the order of docs.
 | 
				
			||||||
        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
 | 
					        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
 | 
				
			||||||
        byte_tuples = chain.from_iterable(
 | 
					        byte_tuples = chain.from_iterable(
 | 
				
			||||||
| 
						 | 
					@ -1705,8 +1711,23 @@ class Language:
 | 
				
			||||||
                    # tell `sender` that one batch was consumed.
 | 
					                    # tell `sender` that one batch was consumed.
 | 
				
			||||||
                    sender.step()
 | 
					                    sender.step()
 | 
				
			||||||
        finally:
 | 
					        finally:
 | 
				
			||||||
 | 
					            # If we are stopping in an orderly fashion, the workers' queues
 | 
				
			||||||
 | 
					            # are empty. Put the sentinel in their queues to signal that work
 | 
				
			||||||
 | 
					            # is done, so that they can exit gracefully.
 | 
				
			||||||
 | 
					            for q in texts_q:
 | 
				
			||||||
 | 
					                q.put(_WORK_DONE_SENTINEL)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Otherwise, we are stopping because the error handler raised an
 | 
				
			||||||
 | 
					            # exception. The sentinel will be last to go out of the queue.
 | 
				
			||||||
 | 
					            # To avoid doing unnecessary work or hanging on platforms that
 | 
				
			||||||
 | 
					            # block on sending (Windows), we'll close our end of the channel.
 | 
				
			||||||
 | 
					            # This signals to the worker that it can exit the next time it
 | 
				
			||||||
 | 
					            # attempts to send data down the channel.
 | 
				
			||||||
 | 
					            for r in bytedocs_recv_ch:
 | 
				
			||||||
 | 
					                r.close()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            for proc in procs:
 | 
					            for proc in procs:
 | 
				
			||||||
                proc.terminate()
 | 
					                proc.join()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _link_components(self) -> None:
 | 
					    def _link_components(self) -> None:
 | 
				
			||||||
        """Register 'listeners' within pipeline components, to allow them to
 | 
					        """Register 'listeners' within pipeline components, to allow them to
 | 
				
			||||||
| 
						 | 
					@ -2323,6 +2344,11 @@ def _apply_pipes(
 | 
				
			||||||
    while True:
 | 
					    while True:
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            texts_with_ctx = receiver.get()
 | 
					            texts_with_ctx = receiver.get()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Stop working if we encounter the end-of-work sentinel.
 | 
				
			||||||
 | 
					            if isinstance(texts_with_ctx, _WorkDoneSentinel):
 | 
				
			||||||
 | 
					                return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            docs = (
 | 
					            docs = (
 | 
				
			||||||
                ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
 | 
					                ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
| 
						 | 
					@ -2331,11 +2357,21 @@ def _apply_pipes(
 | 
				
			||||||
            # Connection does not accept unpickable objects, so send list.
 | 
					            # Connection does not accept unpickable objects, so send list.
 | 
				
			||||||
            byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
 | 
					            byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
 | 
				
			||||||
            padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs))
 | 
					            padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs))
 | 
				
			||||||
            sender.send(byte_docs + padding)  # type: ignore[operator]
 | 
					            data: Sequence[Tuple[Optional[bytes], Optional[Any], Optional[bytes]]] = (
 | 
				
			||||||
 | 
					                byte_docs + padding  # type: ignore[operator]
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
        except Exception:
 | 
					        except Exception:
 | 
				
			||||||
            error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
 | 
					            error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
 | 
				
			||||||
            padding = [(None, None, None)] * (len(texts_with_ctx) - 1)
 | 
					            padding = [(None, None, None)] * (len(texts_with_ctx) - 1)
 | 
				
			||||||
            sender.send(error_msg + padding)
 | 
					            data = error_msg + padding
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            sender.send(data)
 | 
				
			||||||
 | 
					        except BrokenPipeError:
 | 
				
			||||||
 | 
					            # Parent has closed the pipe prematurely. This happens when a
 | 
				
			||||||
 | 
					            # worker encounters an error and the error handler is set to
 | 
				
			||||||
 | 
					            # stop processing.
 | 
				
			||||||
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class _Sender:
 | 
					class _Sender:
 | 
				
			||||||
| 
						 | 
					@ -2365,3 +2401,10 @@ class _Sender:
 | 
				
			||||||
        if self.count >= self.chunk_size:
 | 
					        if self.count >= self.chunk_size:
 | 
				
			||||||
            self.count = 0
 | 
					            self.count = 0
 | 
				
			||||||
            self.send()
 | 
					            self.send()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class _WorkDoneSentinel:
 | 
				
			||||||
 | 
					    pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_WORK_DONE_SENTINEL = _WorkDoneSentinel()
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3,4 +3,4 @@ from .levenshtein import levenshtein
 | 
				
			||||||
from .matcher import Matcher
 | 
					from .matcher import Matcher
 | 
				
			||||||
from .phrasematcher import PhraseMatcher
 | 
					from .phrasematcher import PhraseMatcher
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"]
 | 
					__all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,21 +1,27 @@
 | 
				
			||||||
from functools import partial
 | 
					from functools import partial
 | 
				
			||||||
from typing import List, Optional, cast
 | 
					from typing import List, Optional, Tuple, cast
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from thinc.api import (
 | 
					from thinc.api import (
 | 
				
			||||||
    Dropout,
 | 
					    Dropout,
 | 
				
			||||||
 | 
					    Gelu,
 | 
				
			||||||
    LayerNorm,
 | 
					    LayerNorm,
 | 
				
			||||||
    Linear,
 | 
					    Linear,
 | 
				
			||||||
    Logistic,
 | 
					    Logistic,
 | 
				
			||||||
    Maxout,
 | 
					    Maxout,
 | 
				
			||||||
    Model,
 | 
					    Model,
 | 
				
			||||||
    ParametricAttention,
 | 
					    ParametricAttention,
 | 
				
			||||||
 | 
					    ParametricAttention_v2,
 | 
				
			||||||
    Relu,
 | 
					    Relu,
 | 
				
			||||||
    Softmax,
 | 
					    Softmax,
 | 
				
			||||||
    SparseLinear,
 | 
					    SparseLinear,
 | 
				
			||||||
 | 
					    SparseLinear_v2,
 | 
				
			||||||
    chain,
 | 
					    chain,
 | 
				
			||||||
    clone,
 | 
					    clone,
 | 
				
			||||||
    concatenate,
 | 
					    concatenate,
 | 
				
			||||||
    list2ragged,
 | 
					    list2ragged,
 | 
				
			||||||
 | 
					    reduce_first,
 | 
				
			||||||
 | 
					    reduce_last,
 | 
				
			||||||
 | 
					    reduce_max,
 | 
				
			||||||
    reduce_mean,
 | 
					    reduce_mean,
 | 
				
			||||||
    reduce_sum,
 | 
					    reduce_sum,
 | 
				
			||||||
    residual,
 | 
					    residual,
 | 
				
			||||||
| 
						 | 
					@ -25,9 +31,10 @@ from thinc.api import (
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
from thinc.layers.chain import init as init_chain
 | 
					from thinc.layers.chain import init as init_chain
 | 
				
			||||||
from thinc.layers.resizable import resize_linear_weighted, resize_model
 | 
					from thinc.layers.resizable import resize_linear_weighted, resize_model
 | 
				
			||||||
from thinc.types import Floats2d
 | 
					from thinc.types import ArrayXd, Floats2d
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ...attrs import ORTH
 | 
					from ...attrs import ORTH
 | 
				
			||||||
 | 
					from ...errors import Errors
 | 
				
			||||||
from ...tokens import Doc
 | 
					from ...tokens import Doc
 | 
				
			||||||
from ...util import registry
 | 
					from ...util import registry
 | 
				
			||||||
from ..extract_ngrams import extract_ngrams
 | 
					from ..extract_ngrams import extract_ngrams
 | 
				
			||||||
| 
						 | 
					@ -47,39 +54,15 @@ def build_simple_cnn_text_classifier(
 | 
				
			||||||
    outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
 | 
					    outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
 | 
				
			||||||
    is applied instead, so that outputs are in the range [0, 1].
 | 
					    is applied instead, so that outputs are in the range [0, 1].
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    fill_defaults = {"b": 0, "W": 0}
 | 
					    return build_reduce_text_classifier(
 | 
				
			||||||
    with Model.define_operators({">>": chain}):
 | 
					        tok2vec=tok2vec,
 | 
				
			||||||
        cnn = tok2vec >> list2ragged() >> reduce_mean()
 | 
					        exclusive_classes=exclusive_classes,
 | 
				
			||||||
        nI = tok2vec.maybe_get_dim("nO")
 | 
					        use_reduce_first=False,
 | 
				
			||||||
        if exclusive_classes:
 | 
					        use_reduce_last=False,
 | 
				
			||||||
            output_layer = Softmax(nO=nO, nI=nI)
 | 
					        use_reduce_max=False,
 | 
				
			||||||
            fill_defaults["b"] = NEG_VALUE
 | 
					        use_reduce_mean=True,
 | 
				
			||||||
            resizable_layer: Model = resizable(
 | 
					        nO=nO,
 | 
				
			||||||
                output_layer,
 | 
					 | 
				
			||||||
                resize_layer=partial(
 | 
					 | 
				
			||||||
                    resize_linear_weighted, fill_defaults=fill_defaults
 | 
					 | 
				
			||||||
                ),
 | 
					 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
            model = cnn >> resizable_layer
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            output_layer = Linear(nO=nO, nI=nI)
 | 
					 | 
				
			||||||
            resizable_layer = resizable(
 | 
					 | 
				
			||||||
                output_layer,
 | 
					 | 
				
			||||||
                resize_layer=partial(
 | 
					 | 
				
			||||||
                    resize_linear_weighted, fill_defaults=fill_defaults
 | 
					 | 
				
			||||||
                ),
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
            model = cnn >> resizable_layer >> Logistic()
 | 
					 | 
				
			||||||
        model.set_ref("output_layer", output_layer)
 | 
					 | 
				
			||||||
        model.attrs["resize_output"] = partial(
 | 
					 | 
				
			||||||
            resize_and_set_ref,
 | 
					 | 
				
			||||||
            resizable_layer=resizable_layer,
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    model.set_ref("tok2vec", tok2vec)
 | 
					 | 
				
			||||||
    if nO is not None:
 | 
					 | 
				
			||||||
        model.set_dim("nO", cast(int, nO))
 | 
					 | 
				
			||||||
    model.attrs["multi_label"] = not exclusive_classes
 | 
					 | 
				
			||||||
    return model
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def resize_and_set_ref(model, new_nO, resizable_layer):
 | 
					def resize_and_set_ref(model, new_nO, resizable_layer):
 | 
				
			||||||
| 
						 | 
					@ -95,10 +78,48 @@ def build_bow_text_classifier(
 | 
				
			||||||
    ngram_size: int,
 | 
					    ngram_size: int,
 | 
				
			||||||
    no_output_layer: bool,
 | 
					    no_output_layer: bool,
 | 
				
			||||||
    nO: Optional[int] = None,
 | 
					    nO: Optional[int] = None,
 | 
				
			||||||
 | 
					) -> Model[List[Doc], Floats2d]:
 | 
				
			||||||
 | 
					    return _build_bow_text_classifier(
 | 
				
			||||||
 | 
					        exclusive_classes=exclusive_classes,
 | 
				
			||||||
 | 
					        ngram_size=ngram_size,
 | 
				
			||||||
 | 
					        no_output_layer=no_output_layer,
 | 
				
			||||||
 | 
					        nO=nO,
 | 
				
			||||||
 | 
					        sparse_linear=SparseLinear(nO=nO),
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@registry.architectures("spacy.TextCatBOW.v3")
 | 
				
			||||||
 | 
					def build_bow_text_classifier_v3(
 | 
				
			||||||
 | 
					    exclusive_classes: bool,
 | 
				
			||||||
 | 
					    ngram_size: int,
 | 
				
			||||||
 | 
					    no_output_layer: bool,
 | 
				
			||||||
 | 
					    length: int = 262144,
 | 
				
			||||||
 | 
					    nO: Optional[int] = None,
 | 
				
			||||||
 | 
					) -> Model[List[Doc], Floats2d]:
 | 
				
			||||||
 | 
					    if length < 1:
 | 
				
			||||||
 | 
					        raise ValueError(Errors.E1056.format(length=length))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Find k such that 2**(k-1) < length <= 2**k.
 | 
				
			||||||
 | 
					    length = 2 ** (length - 1).bit_length()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return _build_bow_text_classifier(
 | 
				
			||||||
 | 
					        exclusive_classes=exclusive_classes,
 | 
				
			||||||
 | 
					        ngram_size=ngram_size,
 | 
				
			||||||
 | 
					        no_output_layer=no_output_layer,
 | 
				
			||||||
 | 
					        nO=nO,
 | 
				
			||||||
 | 
					        sparse_linear=SparseLinear_v2(nO=nO, length=length),
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _build_bow_text_classifier(
 | 
				
			||||||
 | 
					    exclusive_classes: bool,
 | 
				
			||||||
 | 
					    ngram_size: int,
 | 
				
			||||||
 | 
					    no_output_layer: bool,
 | 
				
			||||||
 | 
					    sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd],
 | 
				
			||||||
 | 
					    nO: Optional[int] = None,
 | 
				
			||||||
) -> Model[List[Doc], Floats2d]:
 | 
					) -> Model[List[Doc], Floats2d]:
 | 
				
			||||||
    fill_defaults = {"b": 0, "W": 0}
 | 
					    fill_defaults = {"b": 0, "W": 0}
 | 
				
			||||||
    with Model.define_operators({">>": chain}):
 | 
					    with Model.define_operators({">>": chain}):
 | 
				
			||||||
        sparse_linear = SparseLinear(nO=nO)
 | 
					 | 
				
			||||||
        output_layer = None
 | 
					        output_layer = None
 | 
				
			||||||
        if not no_output_layer:
 | 
					        if not no_output_layer:
 | 
				
			||||||
            fill_defaults["b"] = NEG_VALUE
 | 
					            fill_defaults["b"] = NEG_VALUE
 | 
				
			||||||
| 
						 | 
					@ -127,6 +148,9 @@ def build_text_classifier_v2(
 | 
				
			||||||
    linear_model: Model[List[Doc], Floats2d],
 | 
					    linear_model: Model[List[Doc], Floats2d],
 | 
				
			||||||
    nO: Optional[int] = None,
 | 
					    nO: Optional[int] = None,
 | 
				
			||||||
) -> Model[List[Doc], Floats2d]:
 | 
					) -> Model[List[Doc], Floats2d]:
 | 
				
			||||||
 | 
					    # TODO: build the model with _build_parametric_attention_with_residual_nonlinear
 | 
				
			||||||
 | 
					    # in spaCy v4. We don't do this in spaCy v3 to preserve model
 | 
				
			||||||
 | 
					    # compatibility.
 | 
				
			||||||
    exclusive_classes = not linear_model.attrs["multi_label"]
 | 
					    exclusive_classes = not linear_model.attrs["multi_label"]
 | 
				
			||||||
    with Model.define_operators({">>": chain, "|": concatenate}):
 | 
					    with Model.define_operators({">>": chain, "|": concatenate}):
 | 
				
			||||||
        width = tok2vec.maybe_get_dim("nO")
 | 
					        width = tok2vec.maybe_get_dim("nO")
 | 
				
			||||||
| 
						 | 
					@ -190,3 +214,145 @@ def build_text_classifier_lowdata(
 | 
				
			||||||
            model = model >> Dropout(dropout)
 | 
					            model = model >> Dropout(dropout)
 | 
				
			||||||
        model = model >> Logistic()
 | 
					        model = model >> Logistic()
 | 
				
			||||||
    return model
 | 
					    return model
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@registry.architectures("spacy.TextCatParametricAttention.v1")
 | 
				
			||||||
 | 
					def build_textcat_parametric_attention_v1(
 | 
				
			||||||
 | 
					    tok2vec: Model[List[Doc], List[Floats2d]],
 | 
				
			||||||
 | 
					    exclusive_classes: bool,
 | 
				
			||||||
 | 
					    nO: Optional[int] = None,
 | 
				
			||||||
 | 
					) -> Model[List[Doc], Floats2d]:
 | 
				
			||||||
 | 
					    width = tok2vec.maybe_get_dim("nO")
 | 
				
			||||||
 | 
					    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
 | 
				
			||||||
 | 
					        tok2vec=tok2vec,
 | 
				
			||||||
 | 
					        nonlinear_layer=Maxout(nI=width, nO=width),
 | 
				
			||||||
 | 
					        key_transform=Gelu(nI=width, nO=width),
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    with Model.define_operators({">>": chain}):
 | 
				
			||||||
 | 
					        if exclusive_classes:
 | 
				
			||||||
 | 
					            output_layer = Softmax(nO=nO)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            output_layer = Linear(nO=nO) >> Logistic()
 | 
				
			||||||
 | 
					        model = parametric_attention >> output_layer
 | 
				
			||||||
 | 
					    if model.has_dim("nO") is not False and nO is not None:
 | 
				
			||||||
 | 
					        model.set_dim("nO", cast(int, nO))
 | 
				
			||||||
 | 
					    model.set_ref("output_layer", output_layer)
 | 
				
			||||||
 | 
					    model.attrs["multi_label"] = not exclusive_classes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return model
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _build_parametric_attention_with_residual_nonlinear(
 | 
				
			||||||
 | 
					    *,
 | 
				
			||||||
 | 
					    tok2vec: Model[List[Doc], List[Floats2d]],
 | 
				
			||||||
 | 
					    nonlinear_layer: Model[Floats2d, Floats2d],
 | 
				
			||||||
 | 
					    key_transform: Optional[Model[Floats2d, Floats2d]] = None,
 | 
				
			||||||
 | 
					) -> Model[List[Doc], Floats2d]:
 | 
				
			||||||
 | 
					    with Model.define_operators({">>": chain, "|": concatenate}):
 | 
				
			||||||
 | 
					        width = tok2vec.maybe_get_dim("nO")
 | 
				
			||||||
 | 
					        attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
 | 
				
			||||||
 | 
					        norm_layer = LayerNorm(nI=width)
 | 
				
			||||||
 | 
					        parametric_attention = (
 | 
				
			||||||
 | 
					            tok2vec
 | 
				
			||||||
 | 
					            >> list2ragged()
 | 
				
			||||||
 | 
					            >> attention_layer
 | 
				
			||||||
 | 
					            >> reduce_sum()
 | 
				
			||||||
 | 
					            >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        parametric_attention.set_ref("tok2vec", tok2vec)
 | 
				
			||||||
 | 
					        parametric_attention.set_ref("attention_layer", attention_layer)
 | 
				
			||||||
 | 
					        parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
 | 
				
			||||||
 | 
					        parametric_attention.set_ref("norm_layer", norm_layer)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return parametric_attention
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
 | 
				
			||||||
 | 
					    tok2vec_width = get_tok2vec_width(model)
 | 
				
			||||||
 | 
					    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
 | 
				
			||||||
 | 
					    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
 | 
				
			||||||
 | 
					    model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
 | 
				
			||||||
 | 
					    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
 | 
				
			||||||
 | 
					    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
 | 
				
			||||||
 | 
					    init_chain(model, X, Y)
 | 
				
			||||||
 | 
					    return model
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@registry.architectures("spacy.TextCatReduce.v1")
 | 
				
			||||||
 | 
					def build_reduce_text_classifier(
 | 
				
			||||||
 | 
					    tok2vec: Model,
 | 
				
			||||||
 | 
					    exclusive_classes: bool,
 | 
				
			||||||
 | 
					    use_reduce_first: bool,
 | 
				
			||||||
 | 
					    use_reduce_last: bool,
 | 
				
			||||||
 | 
					    use_reduce_max: bool,
 | 
				
			||||||
 | 
					    use_reduce_mean: bool,
 | 
				
			||||||
 | 
					    nO: Optional[int] = None,
 | 
				
			||||||
 | 
					) -> Model[List[Doc], Floats2d]:
 | 
				
			||||||
 | 
					    """Build a model that classifies pooled `Doc` representations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Pooling is performed using reductions. Reductions are concatenated when
 | 
				
			||||||
 | 
					    multiple reductions are used.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    tok2vec (Model): the tok2vec layer to pool over.
 | 
				
			||||||
 | 
					    exclusive_classes (bool): Whether or not classes are mutually exclusive.
 | 
				
			||||||
 | 
					    use_reduce_first (bool): Pool by using the hidden representation of the
 | 
				
			||||||
 | 
					        first token of a `Doc`.
 | 
				
			||||||
 | 
					    use_reduce_last (bool): Pool by using the hidden representation of the
 | 
				
			||||||
 | 
					        last token of a `Doc`.
 | 
				
			||||||
 | 
					    use_reduce_max (bool): Pool by taking the maximum values of the hidden
 | 
				
			||||||
 | 
					        representations of a `Doc`.
 | 
				
			||||||
 | 
					    use_reduce_mean (bool): Pool by taking the mean of all hidden
 | 
				
			||||||
 | 
					        representations of a `Doc`.
 | 
				
			||||||
 | 
					    nO (Optional[int]): Number of classes.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    fill_defaults = {"b": 0, "W": 0}
 | 
				
			||||||
 | 
					    reductions = []
 | 
				
			||||||
 | 
					    if use_reduce_first:
 | 
				
			||||||
 | 
					        reductions.append(reduce_first())
 | 
				
			||||||
 | 
					    if use_reduce_last:
 | 
				
			||||||
 | 
					        reductions.append(reduce_last())
 | 
				
			||||||
 | 
					    if use_reduce_max:
 | 
				
			||||||
 | 
					        reductions.append(reduce_max())
 | 
				
			||||||
 | 
					    if use_reduce_mean:
 | 
				
			||||||
 | 
					        reductions.append(reduce_mean())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if not len(reductions):
 | 
				
			||||||
 | 
					        raise ValueError(Errors.E1057)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    with Model.define_operators({">>": chain}):
 | 
				
			||||||
 | 
					        cnn = tok2vec >> list2ragged() >> concatenate(*reductions)
 | 
				
			||||||
 | 
					        nO_tok2vec = tok2vec.maybe_get_dim("nO")
 | 
				
			||||||
 | 
					        nI = nO_tok2vec * len(reductions) if nO_tok2vec is not None else None
 | 
				
			||||||
 | 
					        if exclusive_classes:
 | 
				
			||||||
 | 
					            output_layer = Softmax(nO=nO, nI=nI)
 | 
				
			||||||
 | 
					            fill_defaults["b"] = NEG_VALUE
 | 
				
			||||||
 | 
					            resizable_layer: Model = resizable(
 | 
				
			||||||
 | 
					                output_layer,
 | 
				
			||||||
 | 
					                resize_layer=partial(
 | 
				
			||||||
 | 
					                    resize_linear_weighted, fill_defaults=fill_defaults
 | 
				
			||||||
 | 
					                ),
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            model = cnn >> resizable_layer
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            output_layer = Linear(nO=nO, nI=nI)
 | 
				
			||||||
 | 
					            resizable_layer = resizable(
 | 
				
			||||||
 | 
					                output_layer,
 | 
				
			||||||
 | 
					                resize_layer=partial(
 | 
				
			||||||
 | 
					                    resize_linear_weighted, fill_defaults=fill_defaults
 | 
				
			||||||
 | 
					                ),
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					            model = cnn >> resizable_layer >> Logistic()
 | 
				
			||||||
 | 
					        model.set_ref("output_layer", output_layer)
 | 
				
			||||||
 | 
					        model.attrs["resize_output"] = partial(
 | 
				
			||||||
 | 
					            resize_and_set_ref,
 | 
				
			||||||
 | 
					            resizable_layer=resizable_layer,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					    model.set_ref("tok2vec", tok2vec)
 | 
				
			||||||
 | 
					    if nO is not None:
 | 
				
			||||||
 | 
					        model.set_dim("nO", cast(int, nO))
 | 
				
			||||||
 | 
					    model.attrs["multi_label"] = not exclusive_classes
 | 
				
			||||||
 | 
					    return model
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -22,6 +22,7 @@ from .trainable_pipe import TrainablePipe
 | 
				
			||||||
__all__ = [
 | 
					__all__ = [
 | 
				
			||||||
    "AttributeRuler",
 | 
					    "AttributeRuler",
 | 
				
			||||||
    "DependencyParser",
 | 
					    "DependencyParser",
 | 
				
			||||||
 | 
					    "EditTreeLemmatizer",
 | 
				
			||||||
    "EntityLinker",
 | 
					    "EntityLinker",
 | 
				
			||||||
    "EntityRecognizer",
 | 
					    "EntityRecognizer",
 | 
				
			||||||
    "EntityRuler",
 | 
					    "EntityRuler",
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -29,7 +29,7 @@ cdef class StateClass:
 | 
				
			||||||
        return [self.B(i) for i in range(self.c.buffer_length())]
 | 
					        return [self.B(i) for i in range(self.c.buffer_length())]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
    def token_vector_lenth(self):
 | 
					    def token_vector_length(self):
 | 
				
			||||||
        return self.doc.tensor.shape[1]
 | 
					        return self.doc.tensor.shape[1]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -36,8 +36,9 @@ maxout_pieces = 3
 | 
				
			||||||
depth = 2
 | 
					depth = 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[model.linear_model]
 | 
					[model.linear_model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = true
 | 
					exclusive_classes = true
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
| 
						 | 
					@ -45,16 +46,21 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m
 | 
				
			||||||
 | 
					
 | 
				
			||||||
single_label_bow_config = """
 | 
					single_label_bow_config = """
 | 
				
			||||||
[model]
 | 
					[model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = true
 | 
					exclusive_classes = true
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
single_label_cnn_config = """
 | 
					single_label_cnn_config = """
 | 
				
			||||||
[model]
 | 
					[model]
 | 
				
			||||||
@architectures = "spacy.TextCatCNN.v2"
 | 
					@architectures = "spacy.TextCatReduce.v1"
 | 
				
			||||||
exclusive_classes = true
 | 
					exclusive_classes = true
 | 
				
			||||||
 | 
					use_reduce_first = false
 | 
				
			||||||
 | 
					use_reduce_last = false
 | 
				
			||||||
 | 
					use_reduce_max = false
 | 
				
			||||||
 | 
					use_reduce_mean = true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[model.tok2vec]
 | 
					[model.tok2vec]
 | 
				
			||||||
@architectures = "spacy.HashEmbedCNN.v2"
 | 
					@architectures = "spacy.HashEmbedCNN.v2"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -35,8 +35,9 @@ maxout_pieces = 3
 | 
				
			||||||
depth = 2
 | 
					depth = 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[model.linear_model]
 | 
					[model.linear_model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = false
 | 
					exclusive_classes = false
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
| 
						 | 
					@ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod
 | 
				
			||||||
 | 
					
 | 
				
			||||||
multi_label_bow_config = """
 | 
					multi_label_bow_config = """
 | 
				
			||||||
[model]
 | 
					[model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = false
 | 
					exclusive_classes = false
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
| 
						 | 
					@ -52,8 +53,12 @@ no_output_layer = false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
multi_label_cnn_config = """
 | 
					multi_label_cnn_config = """
 | 
				
			||||||
[model]
 | 
					[model]
 | 
				
			||||||
@architectures = "spacy.TextCatCNN.v2"
 | 
					@architectures = "spacy.TextCatReduce.v1"
 | 
				
			||||||
exclusive_classes = false
 | 
					exclusive_classes = false
 | 
				
			||||||
 | 
					use_reduce_first = false
 | 
				
			||||||
 | 
					use_reduce_last = false
 | 
				
			||||||
 | 
					use_reduce_max = false
 | 
				
			||||||
 | 
					use_reduce_mean = true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[model.tok2vec]
 | 
					[model.tok2vec]
 | 
				
			||||||
@architectures = "spacy.HashEmbedCNN.v2"
 | 
					@architectures = "spacy.HashEmbedCNN.v2"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										138
									
								
								spacy/scorer.py
									
									
									
									
									
								
							
							
						
						| 
						 | 
					@ -802,6 +802,140 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# The following implementation of trapezoid() is adapted from SciPy,
 | 
				
			||||||
 | 
					# which is distributed under the New BSD License.
 | 
				
			||||||
 | 
					# Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
 | 
				
			||||||
 | 
					# See licenses/3rd_party_licenses.txt
 | 
				
			||||||
 | 
					def trapezoid(y, x=None, dx=1.0, axis=-1):
 | 
				
			||||||
 | 
					    r"""
 | 
				
			||||||
 | 
					    Integrate along the given axis using the composite trapezoidal rule.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    If `x` is provided, the integration happens in sequence along its
 | 
				
			||||||
 | 
					    elements - they are not sorted.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Integrate `y` (`x`) along each 1d slice on the given axis, compute
 | 
				
			||||||
 | 
					    :math:`\int y(x) dx`.
 | 
				
			||||||
 | 
					    When `x` is specified, this integrates along the parametric curve,
 | 
				
			||||||
 | 
					    computing :math:`\int_t y(t) dt =
 | 
				
			||||||
 | 
					    \int_t y(t) \left.\frac{dx}{dt}\right|_{x=x(t)} dt`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Parameters
 | 
				
			||||||
 | 
					    ----------
 | 
				
			||||||
 | 
					    y : array_like
 | 
				
			||||||
 | 
					        Input array to integrate.
 | 
				
			||||||
 | 
					    x : array_like, optional
 | 
				
			||||||
 | 
					        The sample points corresponding to the `y` values. If `x` is None,
 | 
				
			||||||
 | 
					        the sample points are assumed to be evenly spaced `dx` apart. The
 | 
				
			||||||
 | 
					        default is None.
 | 
				
			||||||
 | 
					    dx : scalar, optional
 | 
				
			||||||
 | 
					        The spacing between sample points when `x` is None. The default is 1.
 | 
				
			||||||
 | 
					    axis : int, optional
 | 
				
			||||||
 | 
					        The axis along which to integrate.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Returns
 | 
				
			||||||
 | 
					    -------
 | 
				
			||||||
 | 
					    trapezoid : float or ndarray
 | 
				
			||||||
 | 
					        Definite integral of `y` = n-dimensional array as approximated along
 | 
				
			||||||
 | 
					        a single axis by the trapezoidal rule. If `y` is a 1-dimensional array,
 | 
				
			||||||
 | 
					        then the result is a float. If `n` is greater than 1, then the result
 | 
				
			||||||
 | 
					        is an `n`-1 dimensional array.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See Also
 | 
				
			||||||
 | 
					    --------
 | 
				
			||||||
 | 
					    cumulative_trapezoid, simpson, romb
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Notes
 | 
				
			||||||
 | 
					    -----
 | 
				
			||||||
 | 
					    Image [2]_ illustrates trapezoidal rule -- y-axis locations of points
 | 
				
			||||||
 | 
					    will be taken from `y` array, by default x-axis distances between
 | 
				
			||||||
 | 
					    points will be 1.0, alternatively they can be provided with `x` array
 | 
				
			||||||
 | 
					    or with `dx` scalar.  Return value will be equal to combined area under
 | 
				
			||||||
 | 
					    the red lines.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    References
 | 
				
			||||||
 | 
					    ----------
 | 
				
			||||||
 | 
					    .. [1] Wikipedia page: https://en.wikipedia.org/wiki/Trapezoidal_rule
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    .. [2] Illustration image:
 | 
				
			||||||
 | 
					           https://en.wikipedia.org/wiki/File:Composite_trapezoidal_rule_illustration.png
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Examples
 | 
				
			||||||
 | 
					    --------
 | 
				
			||||||
 | 
					    Use the trapezoidal rule on evenly spaced points:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    >>> import numpy as np
 | 
				
			||||||
 | 
					    >>> from scipy import integrate
 | 
				
			||||||
 | 
					    >>> integrate.trapezoid([1, 2, 3])
 | 
				
			||||||
 | 
					    4.0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    The spacing between sample points can be selected by either the
 | 
				
			||||||
 | 
					    ``x`` or ``dx`` arguments:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    >>> integrate.trapezoid([1, 2, 3], x=[4, 6, 8])
 | 
				
			||||||
 | 
					    8.0
 | 
				
			||||||
 | 
					    >>> integrate.trapezoid([1, 2, 3], dx=2)
 | 
				
			||||||
 | 
					    8.0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Using a decreasing ``x`` corresponds to integrating in reverse:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    >>> integrate.trapezoid([1, 2, 3], x=[8, 6, 4])
 | 
				
			||||||
 | 
					    -8.0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    More generally ``x`` is used to integrate along a parametric curve. We can
 | 
				
			||||||
 | 
					    estimate the integral :math:`\int_0^1 x^2 = 1/3` using:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    >>> x = np.linspace(0, 1, num=50)
 | 
				
			||||||
 | 
					    >>> y = x**2
 | 
				
			||||||
 | 
					    >>> integrate.trapezoid(y, x)
 | 
				
			||||||
 | 
					    0.33340274885464394
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Or estimate the area of a circle, noting we repeat the sample which closes
 | 
				
			||||||
 | 
					    the curve:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    >>> theta = np.linspace(0, 2 * np.pi, num=1000, endpoint=True)
 | 
				
			||||||
 | 
					    >>> integrate.trapezoid(np.cos(theta), x=np.sin(theta))
 | 
				
			||||||
 | 
					    3.141571941375841
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    ``trapezoid`` can be applied along a specified axis to do multiple
 | 
				
			||||||
 | 
					    computations in one call:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    >>> a = np.arange(6).reshape(2, 3)
 | 
				
			||||||
 | 
					    >>> a
 | 
				
			||||||
 | 
					    array([[0, 1, 2],
 | 
				
			||||||
 | 
					           [3, 4, 5]])
 | 
				
			||||||
 | 
					    >>> integrate.trapezoid(a, axis=0)
 | 
				
			||||||
 | 
					    array([1.5, 2.5, 3.5])
 | 
				
			||||||
 | 
					    >>> integrate.trapezoid(a, axis=1)
 | 
				
			||||||
 | 
					    array([2.,  8.])
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    y = np.asanyarray(y)
 | 
				
			||||||
 | 
					    if x is None:
 | 
				
			||||||
 | 
					        d = dx
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        x = np.asanyarray(x)
 | 
				
			||||||
 | 
					        if x.ndim == 1:
 | 
				
			||||||
 | 
					            d = np.diff(x)
 | 
				
			||||||
 | 
					            # reshape to correct shape
 | 
				
			||||||
 | 
					            shape = [1] * y.ndim
 | 
				
			||||||
 | 
					            shape[axis] = d.shape[0]
 | 
				
			||||||
 | 
					            d = d.reshape(shape)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            d = np.diff(x, axis=axis)
 | 
				
			||||||
 | 
					    nd = y.ndim
 | 
				
			||||||
 | 
					    slice1 = [slice(None)] * nd
 | 
				
			||||||
 | 
					    slice2 = [slice(None)] * nd
 | 
				
			||||||
 | 
					    slice1[axis] = slice(1, None)
 | 
				
			||||||
 | 
					    slice2[axis] = slice(None, -1)
 | 
				
			||||||
 | 
					    try:
 | 
				
			||||||
 | 
					        ret = (d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0).sum(axis)
 | 
				
			||||||
 | 
					    except ValueError:
 | 
				
			||||||
 | 
					        # Operations didn't work, cast to ndarray
 | 
				
			||||||
 | 
					        d = np.asarray(d)
 | 
				
			||||||
 | 
					        y = np.asarray(y)
 | 
				
			||||||
 | 
					        ret = np.add.reduce(d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0, axis)
 | 
				
			||||||
 | 
					    return ret
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# The following implementation of roc_auc_score() is adapted from
 | 
					# The following implementation of roc_auc_score() is adapted from
 | 
				
			||||||
# scikit-learn, which is distributed under the New BSD License.
 | 
					# scikit-learn, which is distributed under the New BSD License.
 | 
				
			||||||
# Copyright (c) 2007–2019 The scikit-learn developers.
 | 
					# Copyright (c) 2007–2019 The scikit-learn developers.
 | 
				
			||||||
| 
						 | 
					@ -1024,9 +1158,9 @@ def _auc(x, y):
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            raise ValueError(Errors.E164.format(x=x))
 | 
					            raise ValueError(Errors.E164.format(x=x))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    area = direction * np.trapz(y, x)
 | 
					    area = direction * trapezoid(y, x)
 | 
				
			||||||
    if isinstance(area, np.memmap):
 | 
					    if isinstance(area, np.memmap):
 | 
				
			||||||
        # Reductions such as .sum used internally in np.trapz do not return a
 | 
					        # Reductions such as .sum used internally in trapezoid do not return a
 | 
				
			||||||
        # scalar by default for numpy.memmap instances contrary to
 | 
					        # scalar by default for numpy.memmap instances contrary to
 | 
				
			||||||
        # regular numpy.ndarray instances.
 | 
					        # regular numpy.ndarray instances.
 | 
				
			||||||
        area = area.dtype.type(area)
 | 
					        area = area.dtype.type(area)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -162,6 +162,11 @@ def fi_tokenizer():
 | 
				
			||||||
    return get_lang_class("fi")().tokenizer
 | 
					    return get_lang_class("fi")().tokenizer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.fixture(scope="session")
 | 
				
			||||||
 | 
					def fo_tokenizer():
 | 
				
			||||||
 | 
					    return get_lang_class("fo")().tokenizer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.fixture(scope="session")
 | 
					@pytest.fixture(scope="session")
 | 
				
			||||||
def fr_tokenizer():
 | 
					def fr_tokenizer():
 | 
				
			||||||
    return get_lang_class("fr")().tokenizer
 | 
					    return get_lang_class("fr")().tokenizer
 | 
				
			||||||
| 
						 | 
					@ -317,6 +322,11 @@ def nl_tokenizer():
 | 
				
			||||||
    return get_lang_class("nl")().tokenizer
 | 
					    return get_lang_class("nl")().tokenizer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.fixture(scope="session")
 | 
				
			||||||
 | 
					def nn_tokenizer():
 | 
				
			||||||
 | 
					    return get_lang_class("nn")().tokenizer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.fixture(scope="session")
 | 
					@pytest.fixture(scope="session")
 | 
				
			||||||
def pl_tokenizer():
 | 
					def pl_tokenizer():
 | 
				
			||||||
    return get_lang_class("pl")().tokenizer
 | 
					    return get_lang_class("pl")().tokenizer
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -731,3 +731,12 @@ def test_for_no_ent_sents():
 | 
				
			||||||
    sents = list(doc.ents[0].sents)
 | 
					    sents = list(doc.ents[0].sents)
 | 
				
			||||||
    assert len(sents) == 1
 | 
					    assert len(sents) == 1
 | 
				
			||||||
    assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
 | 
					    assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_span_api_richcmp_other(en_tokenizer):
 | 
				
			||||||
 | 
					    doc1 = en_tokenizer("a b")
 | 
				
			||||||
 | 
					    doc2 = en_tokenizer("b c")
 | 
				
			||||||
 | 
					    assert not doc1[1:2] == doc1[1]
 | 
				
			||||||
 | 
					    assert not doc1[1:2] == doc2[0]
 | 
				
			||||||
 | 
					    assert not doc1[1:2] == doc2[0:1]
 | 
				
			||||||
 | 
					    assert not doc1[0:1] == doc2
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -294,3 +294,12 @@ def test_missing_head_dep(en_vocab):
 | 
				
			||||||
    assert aligned_heads[0] == ref_heads[0]
 | 
					    assert aligned_heads[0] == ref_heads[0]
 | 
				
			||||||
    assert aligned_deps[5] == ref_deps[5]
 | 
					    assert aligned_deps[5] == ref_deps[5]
 | 
				
			||||||
    assert aligned_heads[5] == ref_heads[5]
 | 
					    assert aligned_heads[5] == ref_heads[5]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_token_api_richcmp_other(en_tokenizer):
 | 
				
			||||||
 | 
					    doc1 = en_tokenizer("a b")
 | 
				
			||||||
 | 
					    doc2 = en_tokenizer("b c")
 | 
				
			||||||
 | 
					    assert not doc1[1] == doc1[0:1]
 | 
				
			||||||
 | 
					    assert not doc1[1] == doc2[1:2]
 | 
				
			||||||
 | 
					    assert not doc1[1] == doc2[0]
 | 
				
			||||||
 | 
					    assert not doc1[0] == doc2
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										0
									
								
								spacy/tests/lang/fo/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										26
									
								
								spacy/tests/lang/fo/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1,26 @@
 | 
				
			||||||
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed with CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/)
 | 
				
			||||||
 | 
					# fmt: off
 | 
				
			||||||
 | 
					FO_TOKEN_EXCEPTION_TESTS = [
 | 
				
			||||||
 | 
					    (
 | 
				
			||||||
 | 
					        "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ",
 | 
				
			||||||
 | 
					        [
 | 
				
			||||||
 | 
					            "Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", ".",
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					    ),
 | 
				
			||||||
 | 
					    (
 | 
				
			||||||
 | 
					        "Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.",
 | 
				
			||||||
 | 
					        [
 | 
				
			||||||
 | 
					            "Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", ".",
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					    ),
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
 | 
					# fmt: on
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS)
 | 
				
			||||||
 | 
					def test_fo_tokenizer_handles_exception_cases(fo_tokenizer, text, expected_tokens):
 | 
				
			||||||
 | 
					    tokens = fo_tokenizer(text)
 | 
				
			||||||
 | 
					    token_list = [token.text for token in tokens if not token.is_space]
 | 
				
			||||||
 | 
					    assert expected_tokens == token_list
 | 
				
			||||||
							
								
								
									
										0
									
								
								spacy/tests/lang/nn/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										38
									
								
								spacy/tests/lang/nn/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1,38 @@
 | 
				
			||||||
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
 | 
				
			||||||
 | 
					# fmt: off
 | 
				
			||||||
 | 
					NN_TOKEN_EXCEPTION_TESTS = [
 | 
				
			||||||
 | 
					    (
 | 
				
			||||||
 | 
					        "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.",
 | 
				
			||||||
 | 
					        [
 | 
				
			||||||
 | 
					            "Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", ".",
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					    ),
 | 
				
			||||||
 | 
					    (
 | 
				
			||||||
 | 
					        "Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.",
 | 
				
			||||||
 | 
					        [
 | 
				
			||||||
 | 
					            "Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", ".",
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					    ),
 | 
				
			||||||
 | 
					    (
 | 
				
			||||||
 | 
					        "Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.",
 | 
				
			||||||
 | 
					        [
 | 
				
			||||||
 | 
					            "Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", ".",
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					    ),
 | 
				
			||||||
 | 
					    (
 | 
				
			||||||
 | 
					        "Brukssesongen er frå nov. til mai, med ein topp i mars.",
 | 
				
			||||||
 | 
					        [
 | 
				
			||||||
 | 
					            "Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", ".",
 | 
				
			||||||
 | 
					        ],
 | 
				
			||||||
 | 
					    ),
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
 | 
					# fmt: on
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS)
 | 
				
			||||||
 | 
					def test_nn_tokenizer_handles_exception_cases(nn_tokenizer, text, expected_tokens):
 | 
				
			||||||
 | 
					    tokens = nn_tokenizer(text)
 | 
				
			||||||
 | 
					    token_list = [token.text for token in tokens if not token.is_space]
 | 
				
			||||||
 | 
					    assert expected_tokens == token_list
 | 
				
			||||||
| 
						 | 
					@ -203,7 +203,7 @@ def test_pipe_class_component_model():
 | 
				
			||||||
            "@architectures": "spacy.TextCatEnsemble.v2",
 | 
					            "@architectures": "spacy.TextCatEnsemble.v2",
 | 
				
			||||||
            "tok2vec": DEFAULT_TOK2VEC_MODEL,
 | 
					            "tok2vec": DEFAULT_TOK2VEC_MODEL,
 | 
				
			||||||
            "linear_model": {
 | 
					            "linear_model": {
 | 
				
			||||||
                "@architectures": "spacy.TextCatBOW.v2",
 | 
					                "@architectures": "spacy.TextCatBOW.v3",
 | 
				
			||||||
                "exclusive_classes": False,
 | 
					                "exclusive_classes": False,
 | 
				
			||||||
                "ngram_size": 1,
 | 
					                "ngram_size": 1,
 | 
				
			||||||
                "no_output_layer": False,
 | 
					                "no_output_layer": False,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -414,7 +414,7 @@ def test_implicit_label(name, get_examples):
 | 
				
			||||||
@pytest.mark.parametrize(
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
    "name,textcat_config",
 | 
					    "name,textcat_config",
 | 
				
			||||||
    [
 | 
					    [
 | 
				
			||||||
        # BOW
 | 
					        # BOW V1
 | 
				
			||||||
        ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
 | 
					        ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
 | 
				
			||||||
        ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
 | 
					        ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
 | 
				
			||||||
        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
 | 
					        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
 | 
				
			||||||
| 
						 | 
					@ -451,14 +451,14 @@ def test_no_resize(name, textcat_config):
 | 
				
			||||||
@pytest.mark.parametrize(
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
    "name,textcat_config",
 | 
					    "name,textcat_config",
 | 
				
			||||||
    [
 | 
					    [
 | 
				
			||||||
        # BOW
 | 
					        # BOW V3
 | 
				
			||||||
        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
 | 
					        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
 | 
				
			||||||
        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
 | 
					        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
 | 
				
			||||||
        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
 | 
					        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
 | 
				
			||||||
        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
 | 
					        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
 | 
				
			||||||
        # CNN
 | 
					        # CNN
 | 
				
			||||||
        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
 | 
					        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
 | 
				
			||||||
        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
 | 
					        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
 | 
				
			||||||
    ],
 | 
					    ],
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
# fmt: on
 | 
					# fmt: on
 | 
				
			||||||
| 
						 | 
					@ -480,14 +480,14 @@ def test_resize(name, textcat_config):
 | 
				
			||||||
@pytest.mark.parametrize(
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
    "name,textcat_config",
 | 
					    "name,textcat_config",
 | 
				
			||||||
    [
 | 
					    [
 | 
				
			||||||
        # BOW
 | 
					        # BOW v3
 | 
				
			||||||
        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
 | 
					        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
 | 
				
			||||||
        ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
 | 
					        ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
 | 
				
			||||||
        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
 | 
					        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
 | 
				
			||||||
        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
 | 
					        ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
 | 
				
			||||||
        # CNN
 | 
					        # REDUCE
 | 
				
			||||||
        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
 | 
					        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
 | 
				
			||||||
        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
 | 
					        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
 | 
				
			||||||
    ],
 | 
					    ],
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
# fmt: on
 | 
					# fmt: on
 | 
				
			||||||
| 
						 | 
					@ -693,12 +693,23 @@ def test_overfitting_IO_multi():
 | 
				
			||||||
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
 | 
					        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
 | 
				
			||||||
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
 | 
					        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
 | 
				
			||||||
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
 | 
					        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
 | 
				
			||||||
 | 
					        # BOW V3
 | 
				
			||||||
 | 
					        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
 | 
				
			||||||
 | 
					        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
 | 
				
			||||||
 | 
					        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
 | 
				
			||||||
 | 
					        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
 | 
				
			||||||
        # ENSEMBLE V2
 | 
					        # ENSEMBLE V2
 | 
				
			||||||
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
 | 
					        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
 | 
				
			||||||
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
 | 
					        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
 | 
				
			||||||
        # CNN V2
 | 
					        # CNN V2 (legacy)
 | 
				
			||||||
        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
 | 
					        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
 | 
				
			||||||
        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
 | 
					        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
 | 
				
			||||||
 | 
					        # PARAMETRIC ATTENTION V1
 | 
				
			||||||
 | 
					        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
 | 
				
			||||||
 | 
					        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
 | 
				
			||||||
 | 
					        # REDUCE V1
 | 
				
			||||||
 | 
					        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
 | 
				
			||||||
 | 
					        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
 | 
				
			||||||
    ],
 | 
					    ],
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
# fmt: on
 | 
					# fmt: on
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -15,7 +15,12 @@ def doc_w_attrs(en_tokenizer):
 | 
				
			||||||
    Token.set_extension("_test_token", default="t0")
 | 
					    Token.set_extension("_test_token", default="t0")
 | 
				
			||||||
    doc[1]._._test_token = "t1"
 | 
					    doc[1]._._test_token = "t1"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return doc
 | 
					    yield doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Doc.remove_extension("_test_attr")
 | 
				
			||||||
 | 
					    Doc.remove_extension("_test_prop")
 | 
				
			||||||
 | 
					    Doc.remove_extension("_test_method")
 | 
				
			||||||
 | 
					    Token.remove_extension("_test_token")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
 | 
					def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1061,3 +1061,8 @@ def test_debug_data_trainable_lemmatizer_not_annotated():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
 | 
					    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
 | 
				
			||||||
    assert data["no_lemma_annotations"] == 2
 | 
					    assert data["no_lemma_annotations"] == 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_project_api_imports():
 | 
				
			||||||
 | 
					    from spacy.cli import project_run
 | 
				
			||||||
 | 
					    from spacy.cli.project.run import project_run  # noqa: F401, F811
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -214,9 +214,6 @@ def test_project_clone(options):
 | 
				
			||||||
        assert (out / "README.md").is_file()
 | 
					        assert (out / "README.md").is_file()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.skipif(
 | 
					 | 
				
			||||||
    sys.version_info >= (3, 12), reason="Python 3.12+ not supported for remotes"
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
def test_project_push_pull(project_dir):
 | 
					def test_project_push_pull(project_dir):
 | 
				
			||||||
    proj = dict(SAMPLE_PROJECT)
 | 
					    proj = dict(SAMPLE_PROJECT)
 | 
				
			||||||
    remote = "xyz"
 | 
					    remote = "xyz"
 | 
				
			||||||
| 
						 | 
					@ -241,7 +238,7 @@ def test_project_push_pull(project_dir):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_find_function_valid():
 | 
					def test_find_function_valid():
 | 
				
			||||||
    # example of architecture in main code base
 | 
					    # example of architecture in main code base
 | 
				
			||||||
    function = "spacy.TextCatBOW.v2"
 | 
					    function = "spacy.TextCatBOW.v3"
 | 
				
			||||||
    result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
 | 
					    result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
 | 
				
			||||||
    assert f"Found registered function '{function}'" in result.stdout
 | 
					    assert f"Found registered function '{function}'" in result.stdout
 | 
				
			||||||
    assert "textcat.py" in result.stdout
 | 
					    assert "textcat.py" in result.stdout
 | 
				
			||||||
| 
						 | 
					@ -260,7 +257,7 @@ def test_find_function_valid():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_find_function_invalid():
 | 
					def test_find_function_invalid():
 | 
				
			||||||
    # invalid registry
 | 
					    # invalid registry
 | 
				
			||||||
    function = "spacy.TextCatBOW.v2"
 | 
					    function = "spacy.TextCatBOW.v3"
 | 
				
			||||||
    registry = "foobar"
 | 
					    registry = "foobar"
 | 
				
			||||||
    result = CliRunner().invoke(
 | 
					    result = CliRunner().invoke(
 | 
				
			||||||
        app, ["find-function", function, "--registry", registry]
 | 
					        app, ["find-function", function, "--registry", registry]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2,7 +2,7 @@ import numpy
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from spacy import displacy
 | 
					from spacy import displacy
 | 
				
			||||||
from spacy.displacy.render import DependencyRenderer, EntityRenderer
 | 
					from spacy.displacy.render import DependencyRenderer, EntityRenderer, SpanRenderer
 | 
				
			||||||
from spacy.lang.en import English
 | 
					from spacy.lang.en import English
 | 
				
			||||||
from spacy.lang.fa import Persian
 | 
					from spacy.lang.fa import Persian
 | 
				
			||||||
from spacy.tokens import Doc, Span
 | 
					from spacy.tokens import Doc, Span
 | 
				
			||||||
| 
						 | 
					@ -468,3 +468,23 @@ def test_issue12816(en_vocab) -> None:
 | 
				
			||||||
    # Verify that the HTML tag is still escaped
 | 
					    # Verify that the HTML tag is still escaped
 | 
				
			||||||
    html = displacy.render(doc, style="span")
 | 
					    html = displacy.render(doc, style="span")
 | 
				
			||||||
    assert "<TEST>" in html
 | 
					    assert "<TEST>" in html
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.issue(13056)
 | 
				
			||||||
 | 
					def test_displacy_span_stacking():
 | 
				
			||||||
 | 
					    """Test whether span stacking works properly for multiple overlapping spans."""
 | 
				
			||||||
 | 
					    spans = [
 | 
				
			||||||
 | 
					        {"start_token": 2, "end_token": 5, "label": "SkillNC"},
 | 
				
			||||||
 | 
					        {"start_token": 0, "end_token": 2, "label": "Skill"},
 | 
				
			||||||
 | 
					        {"start_token": 1, "end_token": 3, "label": "Skill"},
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					    tokens = ["Welcome", "to", "the", "Bank", "of", "China", "."]
 | 
				
			||||||
 | 
					    per_token_info = SpanRenderer._assemble_per_token_info(spans=spans, tokens=tokens)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert len(per_token_info) == len(tokens)
 | 
				
			||||||
 | 
					    assert all([len(per_token_info[i]["entities"]) == 1 for i in (0, 3, 4)])
 | 
				
			||||||
 | 
					    assert all([len(per_token_info[i]["entities"]) == 2 for i in (1, 2)])
 | 
				
			||||||
 | 
					    assert per_token_info[1]["entities"][0]["render_slot"] == 1
 | 
				
			||||||
 | 
					    assert per_token_info[1]["entities"][1]["render_slot"] == 2
 | 
				
			||||||
 | 
					    assert per_token_info[2]["entities"][0]["render_slot"] == 2
 | 
				
			||||||
 | 
					    assert per_token_info[2]["entities"][1]["render_slot"] == 3
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -376,8 +376,9 @@ def test_util_dot_section():
 | 
				
			||||||
    factory = "textcat"
 | 
					    factory = "textcat"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    [components.textcat.model]
 | 
					    [components.textcat.model]
 | 
				
			||||||
    @architectures = "spacy.TextCatBOW.v2"
 | 
					    @architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
    exclusive_classes = true
 | 
					    exclusive_classes = true
 | 
				
			||||||
 | 
					    length = 262144
 | 
				
			||||||
    ngram_size = 1
 | 
					    ngram_size = 1
 | 
				
			||||||
    no_output_layer = false
 | 
					    no_output_layer = false
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
| 
						 | 
					@ -485,8 +486,8 @@ def test_to_ternary_int():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_find_available_port():
 | 
					def test_find_available_port():
 | 
				
			||||||
    host = "0.0.0.0"
 | 
					    host = "0.0.0.0"
 | 
				
			||||||
    port = 5000
 | 
					    port = 5001
 | 
				
			||||||
    assert find_available_port(port, host) == port, "Port 5000 isn't free"
 | 
					    assert find_available_port(port, host) == port, "Port 5001 isn't free"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    from wsgiref.simple_server import demo_app, make_server
 | 
					    from wsgiref.simple_server import demo_app, make_server
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -26,6 +26,7 @@ from spacy.ml.models import (
 | 
				
			||||||
    build_Tok2Vec_model,
 | 
					    build_Tok2Vec_model,
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
from spacy.ml.staticvectors import StaticVectors
 | 
					from spacy.ml.staticvectors import StaticVectors
 | 
				
			||||||
 | 
					from spacy.util import registry
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_textcat_bow_kwargs():
 | 
					def get_textcat_bow_kwargs():
 | 
				
			||||||
| 
						 | 
					@ -284,3 +285,17 @@ def test_spancat_model_forward_backward(nO=5):
 | 
				
			||||||
    Y, backprop = model((docs, spans), is_train=True)
 | 
					    Y, backprop = model((docs, spans), is_train=True)
 | 
				
			||||||
    assert Y.shape == (spans.dataXd.shape[0], nO)
 | 
					    assert Y.shape == (spans.dataXd.shape[0], nO)
 | 
				
			||||||
    backprop(Y)
 | 
					    backprop(Y)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_textcat_reduce_invalid_args():
 | 
				
			||||||
 | 
					    textcat_reduce = registry.architectures.get("spacy.TextCatReduce.v1")
 | 
				
			||||||
 | 
					    tok2vec = make_test_tok2vec()
 | 
				
			||||||
 | 
					    with pytest.raises(ValueError, match=r"must be used with at least one reduction"):
 | 
				
			||||||
 | 
					        textcat_reduce(
 | 
				
			||||||
 | 
					            tok2vec=tok2vec,
 | 
				
			||||||
 | 
					            exclusive_classes=False,
 | 
				
			||||||
 | 
					            use_reduce_first=False,
 | 
				
			||||||
 | 
					            use_reduce_last=False,
 | 
				
			||||||
 | 
					            use_reduce_max=False,
 | 
				
			||||||
 | 
					            use_reduce_mean=False,
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab):
 | 
				
			||||||
    assert tokens == explain_tokens
 | 
					    assert tokens == explain_tokens
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_tokenizer_explain_special_matcher_whitespace(en_vocab):
 | 
				
			||||||
 | 
					    rules = {":]": [{"ORTH": ":]"}]}
 | 
				
			||||||
 | 
					    tokenizer = Tokenizer(
 | 
				
			||||||
 | 
					        en_vocab,
 | 
				
			||||||
 | 
					        rules=rules,
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    text = ": ]"
 | 
				
			||||||
 | 
					    tokens = [t.text for t in tokenizer(text)]
 | 
				
			||||||
 | 
					    explain_tokens = [t[1] for t in tokenizer.explain(text)]
 | 
				
			||||||
 | 
					    assert tokens == explain_tokens
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@hypothesis.strategies.composite
 | 
					@hypothesis.strategies.composite
 | 
				
			||||||
def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
 | 
					def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
| 
						 | 
					@ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    tokenizer: Tokenizer = spacy.blank(lang).tokenizer
 | 
					    tokenizer: Tokenizer = spacy.blank(lang).tokenizer
 | 
				
			||||||
    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
 | 
					    # Tokenizer.explain is not intended to handle whitespace or control
 | 
				
			||||||
 | 
					    # characters in the same way as Tokenizer
 | 
				
			||||||
 | 
					    sentence = re.sub(r"\s+", " ", sentence).strip()
 | 
				
			||||||
 | 
					    tokens = [t.text for t in tokenizer(sentence)]
 | 
				
			||||||
    debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
 | 
					    debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
 | 
				
			||||||
    assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
 | 
					    assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -730,6 +730,13 @@ cdef class Tokenizer:
 | 
				
			||||||
            if i in spans_by_start:
 | 
					            if i in spans_by_start:
 | 
				
			||||||
                span = spans_by_start[i]
 | 
					                span = spans_by_start[i]
 | 
				
			||||||
                exc = [d[ORTH] for d in special_cases[span.label_]]
 | 
					                exc = [d[ORTH] for d in special_cases[span.label_]]
 | 
				
			||||||
 | 
					                # The phrase matcher can overmatch for tokens separated by
 | 
				
			||||||
 | 
					                # spaces in the text but not in the underlying rule, so skip
 | 
				
			||||||
 | 
					                # cases where the texts aren't identical
 | 
				
			||||||
 | 
					                if span.text != "".join([self.vocab.strings[orth] for orth in exc]):
 | 
				
			||||||
 | 
					                    final_tokens.append(tokens[i])
 | 
				
			||||||
 | 
					                    i += 1
 | 
				
			||||||
 | 
					                else:
 | 
				
			||||||
                    for j, orth in enumerate(exc):
 | 
					                    for j, orth in enumerate(exc):
 | 
				
			||||||
                        final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
 | 
					                        final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
 | 
				
			||||||
                    i += len(span)
 | 
					                    i += len(span)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5,4 +5,4 @@ from .span import Span
 | 
				
			||||||
from .span_group import SpanGroup
 | 
					from .span_group import SpanGroup
 | 
				
			||||||
from .token import Token
 | 
					from .token import Token
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__all__ = ["Doc", "Token", "Span", "SpanGroup", "DocBin", "MorphAnalysis"]
 | 
					__all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -42,7 +42,7 @@ class Doc:
 | 
				
			||||||
    user_hooks: Dict[str, Callable[..., Any]]
 | 
					    user_hooks: Dict[str, Callable[..., Any]]
 | 
				
			||||||
    user_token_hooks: Dict[str, Callable[..., Any]]
 | 
					    user_token_hooks: Dict[str, Callable[..., Any]]
 | 
				
			||||||
    user_span_hooks: Dict[str, Callable[..., Any]]
 | 
					    user_span_hooks: Dict[str, Callable[..., Any]]
 | 
				
			||||||
    tensor: np.ndarray[Any, np.dtype[np.float_]]
 | 
					    tensor: np.ndarray[Any, np.dtype[np.float64]]
 | 
				
			||||||
    user_data: Dict[str, Any]
 | 
					    user_data: Dict[str, Any]
 | 
				
			||||||
    has_unknown_spaces: bool
 | 
					    has_unknown_spaces: bool
 | 
				
			||||||
    _context: Any
 | 
					    _context: Any
 | 
				
			||||||
| 
						 | 
					@ -125,7 +125,7 @@ class Doc:
 | 
				
			||||||
        vector: Optional[Floats1d] = ...,
 | 
					        vector: Optional[Floats1d] = ...,
 | 
				
			||||||
        alignment_mode: str = ...,
 | 
					        alignment_mode: str = ...,
 | 
				
			||||||
        span_id: Union[int, str] = ...,
 | 
					        span_id: Union[int, str] = ...,
 | 
				
			||||||
    ) -> Span: ...
 | 
					    ) -> Optional[Span]: ...
 | 
				
			||||||
    def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
 | 
					    def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
    def has_vector(self) -> bool: ...
 | 
					    def has_vector(self) -> bool: ...
 | 
				
			||||||
| 
						 | 
					@ -166,7 +166,7 @@ class Doc:
 | 
				
			||||||
    ) -> Doc: ...
 | 
					    ) -> Doc: ...
 | 
				
			||||||
    def to_array(
 | 
					    def to_array(
 | 
				
			||||||
        self, py_attr_ids: Union[int, str, List[Union[int, str]]]
 | 
					        self, py_attr_ids: Union[int, str, List[Union[int, str]]]
 | 
				
			||||||
    ) -> np.ndarray[Any, np.dtype[np.float_]]: ...
 | 
					    ) -> np.ndarray[Any, np.dtype[np.float64]]: ...
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def from_docs(
 | 
					    def from_docs(
 | 
				
			||||||
        docs: List[Doc],
 | 
					        docs: List[Doc],
 | 
				
			||||||
| 
						 | 
					@ -179,15 +179,13 @@ class Doc:
 | 
				
			||||||
        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
 | 
					        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
 | 
				
			||||||
    ) -> None: ...
 | 
					    ) -> None: ...
 | 
				
			||||||
    def from_disk(
 | 
					    def from_disk(
 | 
				
			||||||
        self, path: Union[str, Path], *, exclude: Union[List[str], Tuple[str]] = ...
 | 
					        self, path: Union[str, Path], *, exclude: Iterable[str] = ...
 | 
				
			||||||
    ) -> Doc: ...
 | 
					    ) -> Doc: ...
 | 
				
			||||||
    def to_bytes(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ...
 | 
					    def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
 | 
				
			||||||
    def from_bytes(
 | 
					    def from_bytes(self, bytes_data: bytes, *, exclude: Iterable[str] = ...) -> Doc: ...
 | 
				
			||||||
        self, bytes_data: bytes, *, exclude: Union[List[str], Tuple[str]] = ...
 | 
					    def to_dict(self, *, exclude: Iterable[str] = ...) -> Dict[str, Any]: ...
 | 
				
			||||||
    ) -> Doc: ...
 | 
					 | 
				
			||||||
    def to_dict(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ...
 | 
					 | 
				
			||||||
    def from_dict(
 | 
					    def from_dict(
 | 
				
			||||||
        self, msg: bytes, *, exclude: Union[List[str], Tuple[str]] = ...
 | 
					        self, msg: Dict[str, Any], *, exclude: Iterable[str] = ...
 | 
				
			||||||
    ) -> Doc: ...
 | 
					    ) -> Doc: ...
 | 
				
			||||||
    def extend_tensor(self, tensor: Floats2d) -> None: ...
 | 
					    def extend_tensor(self, tensor: Floats2d) -> None: ...
 | 
				
			||||||
    def retokenize(self) -> Retokenizer: ...
 | 
					    def retokenize(self) -> Retokenizer: ...
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1326,7 +1326,7 @@ cdef class Doc:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        path (str / Path): A path to a directory. Paths may be either
 | 
					        path (str / Path): A path to a directory. Paths may be either
 | 
				
			||||||
            strings or `Path`-like objects.
 | 
					            strings or `Path`-like objects.
 | 
				
			||||||
        exclude (list): String names of serialization fields to exclude.
 | 
					        exclude (Iterable[str]): String names of serialization fields to exclude.
 | 
				
			||||||
        RETURNS (Doc): The modified `Doc` object.
 | 
					        RETURNS (Doc): The modified `Doc` object.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        DOCS: https://spacy.io/api/doc#from_disk
 | 
					        DOCS: https://spacy.io/api/doc#from_disk
 | 
				
			||||||
| 
						 | 
					@ -1339,7 +1339,7 @@ cdef class Doc:
 | 
				
			||||||
    def to_bytes(self, *, exclude=tuple()):
 | 
					    def to_bytes(self, *, exclude=tuple()):
 | 
				
			||||||
        """Serialize, i.e. export the document contents to a binary string.
 | 
					        """Serialize, i.e. export the document contents to a binary string.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        exclude (list): String names of serialization fields to exclude.
 | 
					        exclude (Iterable[str]): String names of serialization fields to exclude.
 | 
				
			||||||
        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
 | 
					        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
 | 
				
			||||||
            all annotations.
 | 
					            all annotations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1351,7 +1351,7 @@ cdef class Doc:
 | 
				
			||||||
        """Deserialize, i.e. import the document contents from a binary string.
 | 
					        """Deserialize, i.e. import the document contents from a binary string.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        data (bytes): The string to load from.
 | 
					        data (bytes): The string to load from.
 | 
				
			||||||
        exclude (list): String names of serialization fields to exclude.
 | 
					        exclude (Iterable[str]): String names of serialization fields to exclude.
 | 
				
			||||||
        RETURNS (Doc): Itself.
 | 
					        RETURNS (Doc): Itself.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        DOCS: https://spacy.io/api/doc#from_bytes
 | 
					        DOCS: https://spacy.io/api/doc#from_bytes
 | 
				
			||||||
| 
						 | 
					@ -1361,11 +1361,8 @@ cdef class Doc:
 | 
				
			||||||
    def to_dict(self, *, exclude=tuple()):
 | 
					    def to_dict(self, *, exclude=tuple()):
 | 
				
			||||||
        """Export the document contents to a dictionary for serialization.
 | 
					        """Export the document contents to a dictionary for serialization.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        exclude (list): String names of serialization fields to exclude.
 | 
					        exclude (Iterable[str]): String names of serialization fields to exclude.
 | 
				
			||||||
        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
 | 
					        RETURNS (Dict[str, Any]): A dictionary representation of the `Doc`
 | 
				
			||||||
            all annotations.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        DOCS: https://spacy.io/api/doc#to_bytes
 | 
					 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        array_head = Doc._get_array_attrs()
 | 
					        array_head = Doc._get_array_attrs()
 | 
				
			||||||
        strings = set()
 | 
					        strings = set()
 | 
				
			||||||
| 
						 | 
					@ -1411,13 +1408,11 @@ cdef class Doc:
 | 
				
			||||||
        return util.to_dict(serializers, exclude)
 | 
					        return util.to_dict(serializers, exclude)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_dict(self, msg, *, exclude=tuple()):
 | 
					    def from_dict(self, msg, *, exclude=tuple()):
 | 
				
			||||||
        """Deserialize, i.e. import the document contents from a binary string.
 | 
					        """Deserialize the document contents from a dictionary representation.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        data (bytes): The string to load from.
 | 
					        msg (Dict[str, Any]): The dictionary to load from.
 | 
				
			||||||
        exclude (list): String names of serialization fields to exclude.
 | 
					        exclude (Iterable[str]): String names of serialization fields to exclude.
 | 
				
			||||||
        RETURNS (Doc): Itself.
 | 
					        RETURNS (Doc): Itself.
 | 
				
			||||||
 | 
					 | 
				
			||||||
        DOCS: https://spacy.io/api/doc#from_dict
 | 
					 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        if self.length != 0:
 | 
					        if self.length != 0:
 | 
				
			||||||
            raise ValueError(Errors.E033.format(length=self.length))
 | 
					            raise ValueError(Errors.E033.format(length=self.length))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -127,14 +127,17 @@ cdef class Span:
 | 
				
			||||||
        self._vector = vector
 | 
					        self._vector = vector
 | 
				
			||||||
        self._vector_norm = vector_norm
 | 
					        self._vector_norm = vector_norm
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __richcmp__(self, Span other, int op):
 | 
					    def __richcmp__(self, object other, int op):
 | 
				
			||||||
        if other is None:
 | 
					        if other is None:
 | 
				
			||||||
            if op == 0 or op == 1 or op == 2:
 | 
					            if op == 0 or op == 1 or op == 2:
 | 
				
			||||||
                return False
 | 
					                return False
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                return True
 | 
					                return True
 | 
				
			||||||
 | 
					        if not isinstance(other, Span):
 | 
				
			||||||
 | 
					            return False
 | 
				
			||||||
 | 
					        cdef Span other_span = other
 | 
				
			||||||
        self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc)
 | 
					        self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc)
 | 
				
			||||||
        other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.id, other.doc)
 | 
					        other_tuple = (other_span.c.start_char, other_span.c.end_char, other_span.c.label, other_span.c.kb_id, other_span.id, other_span.doc)
 | 
				
			||||||
        # <
 | 
					        # <
 | 
				
			||||||
        if op == 0:
 | 
					        if op == 0:
 | 
				
			||||||
            return self_tuple < other_tuple
 | 
					            return self_tuple < other_tuple
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -53,7 +53,12 @@ class Token:
 | 
				
			||||||
    def __bytes__(self) -> bytes: ...
 | 
					    def __bytes__(self) -> bytes: ...
 | 
				
			||||||
    def __str__(self) -> str: ...
 | 
					    def __str__(self) -> str: ...
 | 
				
			||||||
    def __repr__(self) -> str: ...
 | 
					    def __repr__(self) -> str: ...
 | 
				
			||||||
    def __richcmp__(self, other: Token, op: int) -> bool: ...
 | 
					    def __lt__(self, other: Any) -> bool: ...
 | 
				
			||||||
 | 
					    def __le__(self, other: Any) -> bool: ...
 | 
				
			||||||
 | 
					    def __eq__(self, other: Any) -> bool: ...
 | 
				
			||||||
 | 
					    def __ne__(self, other: Any) -> bool: ...
 | 
				
			||||||
 | 
					    def __gt__(self, other: Any) -> bool: ...
 | 
				
			||||||
 | 
					    def __ge__(self, other: Any) -> bool: ...
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
    def _(self) -> Underscore: ...
 | 
					    def _(self) -> Underscore: ...
 | 
				
			||||||
    def nbor(self, i: int = ...) -> Token: ...
 | 
					    def nbor(self, i: int = ...) -> Token: ...
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -139,17 +139,20 @@ cdef class Token:
 | 
				
			||||||
    def __repr__(self):
 | 
					    def __repr__(self):
 | 
				
			||||||
        return self.__str__()
 | 
					        return self.__str__()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __richcmp__(self, Token other, int op):
 | 
					    def __richcmp__(self, object other, int op):
 | 
				
			||||||
        # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
 | 
					        # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
 | 
				
			||||||
        if other is None:
 | 
					        if other is None:
 | 
				
			||||||
            if op in (0, 1, 2):
 | 
					            if op in (0, 1, 2):
 | 
				
			||||||
                return False
 | 
					                return False
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                return True
 | 
					                return True
 | 
				
			||||||
 | 
					        if not isinstance(other, Token):
 | 
				
			||||||
 | 
					            return False
 | 
				
			||||||
 | 
					        cdef Token other_token = other
 | 
				
			||||||
        cdef Doc my_doc = self.doc
 | 
					        cdef Doc my_doc = self.doc
 | 
				
			||||||
        cdef Doc other_doc = other.doc
 | 
					        cdef Doc other_doc = other_token.doc
 | 
				
			||||||
        my = self.idx
 | 
					        my = self.idx
 | 
				
			||||||
        their = other.idx
 | 
					        their = other_token.idx
 | 
				
			||||||
        if op == 0:
 | 
					        if op == 0:
 | 
				
			||||||
            return my < their
 | 
					            return my < their
 | 
				
			||||||
        elif op == 2:
 | 
					        elif op == 2:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -16,3 +16,28 @@ from .iob_utils import (  # noqa: F401
 | 
				
			||||||
    tags_to_entities,
 | 
					    tags_to_entities,
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
from .loggers import console_logger  # noqa: F401
 | 
					from .loggers import console_logger  # noqa: F401
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__all__ = [
 | 
				
			||||||
 | 
					    "Alignment",
 | 
				
			||||||
 | 
					    "Corpus",
 | 
				
			||||||
 | 
					    "Example",
 | 
				
			||||||
 | 
					    "JsonlCorpus",
 | 
				
			||||||
 | 
					    "PlainTextCorpus",
 | 
				
			||||||
 | 
					    "biluo_tags_to_offsets",
 | 
				
			||||||
 | 
					    "biluo_tags_to_spans",
 | 
				
			||||||
 | 
					    "biluo_to_iob",
 | 
				
			||||||
 | 
					    "create_copy_from_base_model",
 | 
				
			||||||
 | 
					    "docs_to_json",
 | 
				
			||||||
 | 
					    "dont_augment",
 | 
				
			||||||
 | 
					    "iob_to_biluo",
 | 
				
			||||||
 | 
					    "minibatch_by_padded_size",
 | 
				
			||||||
 | 
					    "minibatch_by_words",
 | 
				
			||||||
 | 
					    "offsets_to_biluo_tags",
 | 
				
			||||||
 | 
					    "orth_variants_augmenter",
 | 
				
			||||||
 | 
					    "read_json_file",
 | 
				
			||||||
 | 
					    "remove_bilu_prefix",
 | 
				
			||||||
 | 
					    "split_bilu_label",
 | 
				
			||||||
 | 
					    "tags_to_entities",
 | 
				
			||||||
 | 
					    "validate_get_examples",
 | 
				
			||||||
 | 
					    "validate_examples",
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1077,20 +1077,38 @@ def make_tempdir() -> Generator[Path, None, None]:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def is_in_jupyter() -> bool:
 | 
					def is_in_jupyter() -> bool:
 | 
				
			||||||
    """Check if user is running spaCy from a Jupyter notebook by detecting the
 | 
					    """Check if user is running spaCy from a Jupyter or Colab notebook by
 | 
				
			||||||
    IPython kernel. Mainly used for the displaCy visualizer.
 | 
					    detecting the IPython kernel. Mainly used for the displaCy visualizer.
 | 
				
			||||||
    RETURNS (bool): True if in Jupyter, False if not.
 | 
					    RETURNS (bool): True if in Jupyter/Colab, False if not.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    # https://stackoverflow.com/a/39662359/6400719
 | 
					    # https://stackoverflow.com/a/39662359/6400719
 | 
				
			||||||
 | 
					    # https://stackoverflow.com/questions/15411967
 | 
				
			||||||
    try:
 | 
					    try:
 | 
				
			||||||
        shell = get_ipython().__class__.__name__  # type: ignore[name-defined]
 | 
					        if get_ipython().__class__.__name__ == "ZMQInteractiveShell":  # type: ignore[name-defined]
 | 
				
			||||||
        if shell == "ZMQInteractiveShell":
 | 
					 | 
				
			||||||
            return True  # Jupyter notebook or qtconsole
 | 
					            return True  # Jupyter notebook or qtconsole
 | 
				
			||||||
 | 
					        if get_ipython().__class__.__module__ == "google.colab._shell":  # type: ignore[name-defined]
 | 
				
			||||||
 | 
					            return True  # Colab notebook
 | 
				
			||||||
    except NameError:
 | 
					    except NameError:
 | 
				
			||||||
        return False  # Probably standard Python interpreter
 | 
					        pass  # Probably standard Python interpreter
 | 
				
			||||||
 | 
					    # additional check for Colab
 | 
				
			||||||
 | 
					    try:
 | 
				
			||||||
 | 
					        import google.colab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return True  # Colab notebook
 | 
				
			||||||
 | 
					    except ImportError:
 | 
				
			||||||
 | 
					        pass
 | 
				
			||||||
    return False
 | 
					    return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def is_in_interactive() -> bool:
 | 
				
			||||||
 | 
					    """Check if user is running spaCy from an interactive Python
 | 
				
			||||||
 | 
					    shell. Will return True in Jupyter notebooks too.
 | 
				
			||||||
 | 
					    RETURNS (bool): True if in interactive mode, False if not.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    # https://stackoverflow.com/questions/2356399/tell-if-python-is-in-interactive-mode
 | 
				
			||||||
 | 
					    return hasattr(sys, "ps1") or hasattr(sys, "ps2")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_object_name(obj: Any) -> str:
 | 
					def get_object_name(obj: Any) -> str:
 | 
				
			||||||
    """Get a human-readable name of a Python object, e.g. a pipeline component.
 | 
					    """Get a human-readable name of a Python object, e.g. a pipeline component.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -79,7 +79,7 @@ subword features, and a
 | 
				
			||||||
consisting of a CNN and a layer-normalized maxout activation function.
 | 
					consisting of a CNN and a layer-normalized maxout activation function.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name                 | Description                                                                                                                                                                                                                                                                 |
 | 
					| Name                 | Description                                                                                                                                                                                                                                                                 |
 | 
				
			||||||
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
					| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
				
			||||||
| `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~                                                                                                        |
 | 
					| `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~                                                                                                        |
 | 
				
			||||||
| `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~                                                                                                                                                                              |
 | 
					| `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~                                                                                                                                                                              |
 | 
				
			||||||
| `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~                                                                                          |
 | 
					| `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~                                                                                          |
 | 
				
			||||||
| 
						 | 
					@ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the
 | 
				
			||||||
> nO = null
 | 
					> nO = null
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> [model.linear_model]
 | 
					> [model.linear_model]
 | 
				
			||||||
> @architectures = "spacy.TextCatBOW.v2"
 | 
					> @architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
> exclusive_classes = true
 | 
					> exclusive_classes = true
 | 
				
			||||||
 | 
					> length = 262144
 | 
				
			||||||
> ngram_size = 1
 | 
					> ngram_size = 1
 | 
				
			||||||
> no_output_layer = false
 | 
					> no_output_layer = false
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
| 
						 | 
					@ -1017,54 +1018,15 @@ but used an internal `tok2vec` instead of taking it as argument:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
</Accordion>
 | 
					</Accordion>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### spacy.TextCatCNN.v2 {id="TextCatCNN"}
 | 
					### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Example Config
 | 
					> #### Example Config
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> ```ini
 | 
					> ```ini
 | 
				
			||||||
> [model]
 | 
					> [model]
 | 
				
			||||||
> @architectures = "spacy.TextCatCNN.v2"
 | 
					> @architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
> exclusive_classes = false
 | 
					 | 
				
			||||||
> nO = null
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
> [model.tok2vec]
 | 
					 | 
				
			||||||
> @architectures = "spacy.HashEmbedCNN.v2"
 | 
					 | 
				
			||||||
> pretrained_vectors = null
 | 
					 | 
				
			||||||
> width = 96
 | 
					 | 
				
			||||||
> depth = 4
 | 
					 | 
				
			||||||
> embed_size = 2000
 | 
					 | 
				
			||||||
> window_size = 1
 | 
					 | 
				
			||||||
> maxout_pieces = 3
 | 
					 | 
				
			||||||
> subword_features = true
 | 
					 | 
				
			||||||
> ```
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
A neural network model where token vectors are calculated using a CNN. The
 | 
					 | 
				
			||||||
vectors are mean pooled and used as features in a feed-forward network. This
 | 
					 | 
				
			||||||
architecture is usually less accurate than the ensemble, but runs faster.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
| Name                | Description                                                                                                                                                                                    |
 | 
					 | 
				
			||||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
					 | 
				
			||||||
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
 | 
					 | 
				
			||||||
| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
 | 
					 | 
				
			||||||
| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
					 | 
				
			||||||
| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
 | 
					 | 
				
			||||||
not yet resizable. Since v2, new labels can be added to this component, even
 | 
					 | 
				
			||||||
after training.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
</Accordion>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
### spacy.TextCatBOW.v2 {id="TextCatBOW"}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
> #### Example Config
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
> ```ini
 | 
					 | 
				
			||||||
> [model]
 | 
					 | 
				
			||||||
> @architectures = "spacy.TextCatBOW.v2"
 | 
					 | 
				
			||||||
> exclusive_classes = false
 | 
					> exclusive_classes = false
 | 
				
			||||||
 | 
					> length = 262144
 | 
				
			||||||
> ngram_size = 1
 | 
					> ngram_size = 1
 | 
				
			||||||
> no_output_layer = false
 | 
					> no_output_layer = false
 | 
				
			||||||
> nO = null
 | 
					> nO = null
 | 
				
			||||||
| 
						 | 
					@ -1078,17 +1040,108 @@ the others, but may not be as accurate, especially if texts are short.
 | 
				
			||||||
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
 | 
					| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
 | 
				
			||||||
| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           |
 | 
					| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           |
 | 
				
			||||||
| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          |
 | 
					| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          |
 | 
				
			||||||
 | 
					| `length`            | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~                                              |
 | 
				
			||||||
| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
					| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
				
			||||||
| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
					| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Accordion title="spacy.TextCatBOW.v1 definition" spaced>
 | 
					<Accordion title="Previous versions of spacy.TextCatBOW" spaced>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was
 | 
					- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2,
 | 
				
			||||||
not yet resizable. Since v2, new labels can be added to this component, even
 | 
					  new labels can be added to this component, even after training.
 | 
				
			||||||
after training.
 | 
					- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
 | 
				
			||||||
 | 
					  [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear
 | 
				
			||||||
 | 
					  layer that only used a small number of the allocated parameters.
 | 
				
			||||||
 | 
					- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
 | 
				
			||||||
 | 
					  [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
</Accordion>
 | 
					</Accordion>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### Example Config
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```ini
 | 
				
			||||||
 | 
					> [model]
 | 
				
			||||||
 | 
					> @architectures = "spacy.TextCatParametricAttention.v1"
 | 
				
			||||||
 | 
					> exclusive_classes = true
 | 
				
			||||||
 | 
					> nO = null
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.tok2vec]
 | 
				
			||||||
 | 
					> @architectures = "spacy.Tok2Vec.v2"
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.tok2vec.embed]
 | 
				
			||||||
 | 
					> @architectures = "spacy.MultiHashEmbed.v2"
 | 
				
			||||||
 | 
					> width = 64
 | 
				
			||||||
 | 
					> rows = [2000, 2000, 1000, 1000, 1000, 1000]
 | 
				
			||||||
 | 
					> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
 | 
				
			||||||
 | 
					> include_static_vectors = false
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.tok2vec.encode]
 | 
				
			||||||
 | 
					> @architectures = "spacy.MaxoutWindowEncoder.v2"
 | 
				
			||||||
 | 
					> width = ${model.tok2vec.embed.width}
 | 
				
			||||||
 | 
					> window_size = 1
 | 
				
			||||||
 | 
					> maxout_pieces = 3
 | 
				
			||||||
 | 
					> depth = 2
 | 
				
			||||||
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					A neural network model that is built upon Tok2Vec and uses parametric attention
 | 
				
			||||||
 | 
					to attend to tokens that are relevant to text classification.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Name                | Description                                                                                                                                                                                    |
 | 
				
			||||||
 | 
					| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
				
			||||||
 | 
					| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     |
 | 
				
			||||||
 | 
					| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
 | 
				
			||||||
 | 
					| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
				
			||||||
 | 
					| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### Example Config
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```ini
 | 
				
			||||||
 | 
					> [model]
 | 
				
			||||||
 | 
					> @architectures = "spacy.TextCatReduce.v1"
 | 
				
			||||||
 | 
					> exclusive_classes = false
 | 
				
			||||||
 | 
					> use_reduce_first = false
 | 
				
			||||||
 | 
					> use_reduce_last = false
 | 
				
			||||||
 | 
					> use_reduce_max = false
 | 
				
			||||||
 | 
					> use_reduce_mean = true
 | 
				
			||||||
 | 
					> nO = null
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.tok2vec]
 | 
				
			||||||
 | 
					> @architectures = "spacy.HashEmbedCNN.v2"
 | 
				
			||||||
 | 
					> pretrained_vectors = null
 | 
				
			||||||
 | 
					> width = 96
 | 
				
			||||||
 | 
					> depth = 4
 | 
				
			||||||
 | 
					> embed_size = 2000
 | 
				
			||||||
 | 
					> window_size = 1
 | 
				
			||||||
 | 
					> maxout_pieces = 3
 | 
				
			||||||
 | 
					> subword_features = true
 | 
				
			||||||
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					A classifier that pools token hidden representations of each `Doc` using first, last,
 | 
				
			||||||
 | 
					max or mean reduction and then applies a classification layer. Reductions are
 | 
				
			||||||
 | 
					concatenated when multiple reductions are used.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					<Infobox variant="warning" title="Relation to TextCatCNN" id="TextCatCNN">
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					`TextCatReduce` is a generalization of the older
 | 
				
			||||||
 | 
					[`TextCatCNN`](/api/legacy#TextCatCNN_v2) model. `TextCatCNN` always uses a mean
 | 
				
			||||||
 | 
					reduction, whereas `TextCatReduce` also supports first/last/max reductions.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					</Infobox>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Name                | Description                                                                                                                                                                                    |
 | 
				
			||||||
 | 
					| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
				
			||||||
 | 
					| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
 | 
				
			||||||
 | 
					| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
 | 
				
			||||||
 | 
					| `use_reduce_first`  | Pool by using the hidden representation of the first token of a `Doc`. ~~bool~~                                                                                                                |
 | 
				
			||||||
 | 
					| `use_reduce_last`   | Pool by using the hidden representation of the last token of a `Doc`. ~~bool~~                                                                                                                 |
 | 
				
			||||||
 | 
					| `use_reduce_max`    | Pool by taking the maximum values of the hidden representations of a `Doc`. ~~bool~~                                                                                                           |
 | 
				
			||||||
 | 
					| `use_reduce_mean`   | Pool by taking the mean of all hidden representations of a `Doc`. ~~bool~~                                                                                                                     |
 | 
				
			||||||
 | 
					| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
				
			||||||
 | 
					| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"}
 | 
					## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### spacy.SpanCategorizer.v1 {id="SpanCategorizer"}
 | 
					### spacy.SpanCategorizer.v1 {id="SpanCategorizer"}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1268,13 +1268,14 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is
 | 
				
			||||||
warmed up before any measurements are taken.
 | 
					warmed up before any measurements are taken.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```cli
 | 
					```cli
 | 
				
			||||||
$ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
 | 
					$ python -m spacy benchmark speed [model] [data_path] [--code] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name                 | Description                                                                                                                                                                          |
 | 
					| Name                 | Description                                                                                                                                                                          |
 | 
				
			||||||
| -------------------- | -------------------------------------------------------------------------------------------------------- |
 | 
					| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
				
			||||||
| `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~                                                                             |
 | 
					| `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~                                                                             |
 | 
				
			||||||
| `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                             |
 | 
					| `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                             |
 | 
				
			||||||
 | 
					| `--code`, `-c`       | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | 
				
			||||||
| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~                                                                                       |
 | 
					| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~                                                                                       |
 | 
				
			||||||
| `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                                                                                                      |
 | 
					| `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                                                                                                      |
 | 
				
			||||||
| `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
 | 
					| `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
 | 
				
			||||||
| 
						 | 
					@ -1296,6 +1297,9 @@ input formats are:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
When a directory is provided it is traversed recursively to collect all files.
 | 
					When a directory is provided it is traversed recursively to collect all files.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					When loading a .spacy file, any potential annotations stored on the `Doc` that are not overwritten by the pipeline will be preserved.
 | 
				
			||||||
 | 
					If you want to evaluate the pipeline on raw text only, make sure that the .spacy file does not contain any annotations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```bash
 | 
					```bash
 | 
				
			||||||
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 | 
					$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -400,6 +400,14 @@ identifiers are grouped by token. Instances of this class are typically assigned
 | 
				
			||||||
to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension
 | 
					to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension
 | 
				
			||||||
attribute.
 | 
					attribute.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### Example
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```python
 | 
				
			||||||
 | 
					> # Get the last hidden layer output for "is" (token index 1)
 | 
				
			||||||
 | 
					> doc = nlp("This is a text.")
 | 
				
			||||||
 | 
					> tensors = doc._.trf_data.last_hidden_layer_state[1]
 | 
				
			||||||
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name              | Description                                                                                                                                                                        |
 | 
					| Name              | Description                                                                                                                                                                        |
 | 
				
			||||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
					| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
				
			||||||
| `all_outputs`     | List of `Ragged` tensors that corresponds to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ |
 | 
					| `all_outputs`     | List of `Ragged` tensors that corresponds to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ |
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -20,10 +20,9 @@ An LLM component is implemented through the `LLMWrapper` class. It is accessible
 | 
				
			||||||
through a generic `llm`
 | 
					through a generic `llm`
 | 
				
			||||||
[component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories)
 | 
					[component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories)
 | 
				
			||||||
as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
 | 
					as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
 | 
				
			||||||
`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization` and
 | 
					`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization`,
 | 
				
			||||||
`llm_entity_linker`.
 | 
					`llm_entity_linker`, `llm_raw` and `llm_translation`. For these factories, the
 | 
				
			||||||
 | 
					GPT-3.5 model from OpenAI is used by default, but this can be customized.
 | 
				
			||||||
### LLMWrapper.\_\_init\_\_ {id="init",tag="method"}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Example
 | 
					> #### Example
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
| 
						 | 
					@ -33,13 +32,18 @@ as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
 | 
				
			||||||
> llm = nlp.add_pipe("llm", config=config)
 | 
					> llm = nlp.add_pipe("llm", config=config)
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> # Construction via add_pipe with a task-specific factory and default GPT3.5 model
 | 
					> # Construction via add_pipe with a task-specific factory and default GPT3.5 model
 | 
				
			||||||
> llm = nlp.add_pipe("llm-ner")
 | 
					> llm = nlp.add_pipe("llm_ner")
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> # Construction via add_pipe with a task-specific factory and custom model
 | 
				
			||||||
 | 
					> llm = nlp.add_pipe("llm_ner", config={"model": {"@llm_models": "spacy.Dolly.v1", "name": "dolly-v2-12b"}})
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> # Construction from class
 | 
					> # Construction from class
 | 
				
			||||||
> from spacy_llm.pipeline import LLMWrapper
 | 
					> from spacy_llm.pipeline import LLMWrapper
 | 
				
			||||||
> llm = LLMWrapper(vocab=nlp.vocab, task=task, model=model, cache=cache, save_io=True)
 | 
					> llm = LLMWrapper(vocab=nlp.vocab, task=task, model=model, cache=cache, save_io=True)
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### LLMWrapper.\_\_init\_\_ {id="init",tag="method"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Create a new pipeline instance. In your application, you would normally use a
 | 
					Create a new pipeline instance. In your application, you would normally use a
 | 
				
			||||||
shortcut for this and instantiate the component using its string name and
 | 
					shortcut for this and instantiate the component using its string name and
 | 
				
			||||||
[`nlp.add_pipe`](/api/language#add_pipe).
 | 
					[`nlp.add_pipe`](/api/language#add_pipe).
 | 
				
			||||||
| 
						 | 
					@ -225,8 +229,8 @@ All tasks are registered in the `llm_tasks` registry.
 | 
				
			||||||
dataset across multiple storage units for easier processing and lookups. In
 | 
					dataset across multiple storage units for easier processing and lookups. In
 | 
				
			||||||
`spacy-llm` we use this term (synonymously: "mapping") to describe the splitting
 | 
					`spacy-llm` we use this term (synonymously: "mapping") to describe the splitting
 | 
				
			||||||
up of prompts if they are too long for a model to handle, and "fusing"
 | 
					up of prompts if they are too long for a model to handle, and "fusing"
 | 
				
			||||||
(synonymously: "reducing") to describe how the model responses for several shards
 | 
					(synonymously: "reducing") to describe how the model responses for several
 | 
				
			||||||
are merged back together into a single document.
 | 
					shards are merged back together into a single document.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Prompts are broken up in a manner that _always_ keeps the prompt in the template
 | 
					Prompts are broken up in a manner that _always_ keeps the prompt in the template
 | 
				
			||||||
intact, meaning that the instructions to the LLM will always stay complete. The
 | 
					intact, meaning that the instructions to the LLM will always stay complete. The
 | 
				
			||||||
| 
						 | 
					@ -1133,6 +1137,25 @@ supports `.yml`, `.yaml`, `.json` and `.jsonl`.
 | 
				
			||||||
path = "textcat_examples.json"
 | 
					path = "textcat_examples.json"
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					If you want to perform few-shot learning with a binary classifier (i. e. a text
 | 
				
			||||||
 | 
					either should or should not be assigned to a given class), you can provide
 | 
				
			||||||
 | 
					positive and negative examples with answers of "POS" or "NEG". "POS" means that
 | 
				
			||||||
 | 
					this example should be assigned the class label defined in the configuration,
 | 
				
			||||||
 | 
					"NEG" means it shouldn't. E. g. for spam classification:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```json
 | 
				
			||||||
 | 
					[
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    "text": "You won the lottery! Wire a fee of 200$ to be able to withdraw your winnings.",
 | 
				
			||||||
 | 
					    "answer": "POS"
 | 
				
			||||||
 | 
					  },
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    "text": "Your order #123456789 has arrived",
 | 
				
			||||||
 | 
					    "answer": "NEG"
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### REL {id="rel"}
 | 
					### REL {id="rel"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The REL task extracts relations between named entities.
 | 
					The REL task extracts relations between named entities.
 | 
				
			||||||
| 
						 | 
					@ -1358,13 +1381,14 @@ contain provider-specific keys and values, as it will be passed onwards to the
 | 
				
			||||||
provider's API.
 | 
					provider's API.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Argument           | Description                                                                                                                                                                    |
 | 
					| Argument           | Description                                                                                                                                                                    |
 | 
				
			||||||
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
					| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
				
			||||||
| `name`             | Model name, i. e. any supported variant for this particular model. Default depends on the specific model (cf. below) ~~str~~                                                   |
 | 
					| `name`             | Model name, i. e. any supported variant for this particular model. Default depends on the specific model (cf. below) ~~str~~                                                   |
 | 
				
			||||||
| `config`           | Further configuration passed on to the model. Default depends on the specific model (cf. below). ~~Dict[Any, Any]~~                                                            |
 | 
					| `config`           | Further configuration passed on to the model. Default depends on the specific model (cf. below). ~~Dict[Any, Any]~~                                                            |
 | 
				
			||||||
| `strict`           | If `True`, raises an error if the LLM API returns a malformed response. Otherwise, return the error responses as is. Defaults to `True`. ~~bool~~                              |
 | 
					| `strict`           | If `True`, raises an error if the LLM API returns a malformed response. Otherwise, return the error responses as is. Defaults to `True`. ~~bool~~                              |
 | 
				
			||||||
| `max_tries`        | Max. number of tries for API request. Defaults to `5`. ~~int~~                                                                                                                 |
 | 
					| `max_tries`        | Max. number of tries for API request. Defaults to `5`. ~~int~~                                                                                                                 |
 | 
				
			||||||
| `max_request_time` | Max. time (in seconds) to wait for request to terminate before raising an exception. Defaults to `30.0`. ~~float~~                                                             |
 | 
					| `max_request_time` | Max. time (in seconds) to wait for request to terminate before raising an exception. Defaults to `30.0`. ~~float~~                                                             |
 | 
				
			||||||
| `interval`         | Time interval (in seconds) for API retries in seconds. Defaults to `1.0`. ~~float~~                                                                                            |
 | 
					| `interval`         | Time interval (in seconds) for API retries. Defaults to `1.0`. ~~float~~                                                                                                       |
 | 
				
			||||||
 | 
					| `endpoint`         | Endpoint URL. Defaults to the provider's standard URL, if available (which is not the case for providers with exclusively custom deployments, such as Azure) ~~Optional[str]~~ |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Example config:
 | 
					> #### Example config:
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
| 
						 | 
					@ -1483,7 +1507,7 @@ These models all take the same parameters:
 | 
				
			||||||
> ```ini
 | 
					> ```ini
 | 
				
			||||||
> [components.llm.model]
 | 
					> [components.llm.model]
 | 
				
			||||||
> @llm_models = "spacy.Llama2.v1"
 | 
					> @llm_models = "spacy.Llama2.v1"
 | 
				
			||||||
> name = "llama2-7b-hf"
 | 
					> name = "Llama-2-7b-hf"
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Currently, these models are provided as part of the core library:
 | 
					Currently, these models are provided as part of the core library:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -162,7 +162,10 @@ network has an internal CNN Tok2Vec layer and uses attention.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means
 | 
					Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means
 | 
				
			||||||
that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not
 | 
					that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not
 | 
				
			||||||
yet support that.
 | 
					yet support that. `TextCatCNN` has been replaced by the more general
 | 
				
			||||||
 | 
					[`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is
 | 
				
			||||||
 | 
					identical to `TextCatReduce` with `use_reduce_mean=true`,
 | 
				
			||||||
 | 
					`use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Example Config
 | 
					> #### Example Config
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
| 
						 | 
					@ -194,11 +197,58 @@ architecture is usually less accurate than the ensemble, but runs faster.
 | 
				
			||||||
| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
					| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
				
			||||||
| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
					| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### spacy.TextCatCNN.v2 {id="TextCatCNN_v2"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### Example Config
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```ini
 | 
				
			||||||
 | 
					> [model]
 | 
				
			||||||
 | 
					> @architectures = "spacy.TextCatCNN.v2"
 | 
				
			||||||
 | 
					> exclusive_classes = false
 | 
				
			||||||
 | 
					> nO = null
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> [model.tok2vec]
 | 
				
			||||||
 | 
					> @architectures = "spacy.HashEmbedCNN.v2"
 | 
				
			||||||
 | 
					> pretrained_vectors = null
 | 
				
			||||||
 | 
					> width = 96
 | 
				
			||||||
 | 
					> depth = 4
 | 
				
			||||||
 | 
					> embed_size = 2000
 | 
				
			||||||
 | 
					> window_size = 1
 | 
				
			||||||
 | 
					> maxout_pieces = 3
 | 
				
			||||||
 | 
					> subword_features = true
 | 
				
			||||||
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					A neural network model where token vectors are calculated using a CNN. The
 | 
				
			||||||
 | 
					vectors are mean pooled and used as features in a feed-forward network. This
 | 
				
			||||||
 | 
					architecture is usually less accurate than the ensemble, but runs faster.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					`TextCatCNN` has been replaced by the more general
 | 
				
			||||||
 | 
					[`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is
 | 
				
			||||||
 | 
					identical to `TextCatReduce` with `use_reduce_mean=true`,
 | 
				
			||||||
 | 
					`use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Name                | Description                                                                                                                                                                                    |
 | 
				
			||||||
 | 
					| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
				
			||||||
 | 
					| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
 | 
				
			||||||
 | 
					| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
 | 
				
			||||||
 | 
					| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
				
			||||||
 | 
					| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
 | 
				
			||||||
 | 
					not yet resizable. Since v2, new labels can be added to this component, even
 | 
				
			||||||
 | 
					after training.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					</Accordion>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### spacy.TextCatBOW.v1 {id="TextCatBOW_v1"}
 | 
					### spacy.TextCatBOW.v1 {id="TextCatBOW_v1"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
 | 
					Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
 | 
				
			||||||
that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
 | 
					that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
 | 
				
			||||||
yet support that.
 | 
					yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an
 | 
				
			||||||
 | 
					erroneous sparse linear layer that only used a small number of the allocated
 | 
				
			||||||
 | 
					parameters.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Example Config
 | 
					> #### Example Config
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
| 
						 | 
					@ -222,6 +272,33 @@ the others, but may not be as accurate, especially if texts are short.
 | 
				
			||||||
| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
					| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
				
			||||||
| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
					| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### spacy.TextCatBOW.v2 {id="TextCatBOW"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse
 | 
				
			||||||
 | 
					linear layer that only used a small number of the allocated parameters.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### Example Config
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```ini
 | 
				
			||||||
 | 
					> [model]
 | 
				
			||||||
 | 
					> @architectures = "spacy.TextCatBOW.v2"
 | 
				
			||||||
 | 
					> exclusive_classes = false
 | 
				
			||||||
 | 
					> ngram_size = 1
 | 
				
			||||||
 | 
					> no_output_layer = false
 | 
				
			||||||
 | 
					> nO = null
 | 
				
			||||||
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					An n-gram "bag-of-words" model. This architecture should run much faster than
 | 
				
			||||||
 | 
					the others, but may not be as accurate, especially if texts are short.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Name                | Description                                                                                                                                                                                    |
 | 
				
			||||||
 | 
					| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
				
			||||||
 | 
					| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
 | 
				
			||||||
 | 
					| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           |
 | 
				
			||||||
 | 
					| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          |
 | 
				
			||||||
 | 
					| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | 
				
			||||||
 | 
					| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 | 
					### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Identical to
 | 
					Identical to
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -89,6 +89,21 @@ architectures and their arguments and hyperparameters.
 | 
				
			||||||
| `negative_weight` <Tag variant="new">3.5.1</Tag>    | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~                                                                                                               |
 | 
					| `negative_weight` <Tag variant="new">3.5.1</Tag>    | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~                                                                                                               |
 | 
				
			||||||
| `allow_overlap` <Tag variant="new">3.5.1</Tag>      | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~                                                                                                                                                        |
 | 
					| `allow_overlap` <Tag variant="new">3.5.1</Tag>      | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~                                                                                                                                                        |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					<Infobox variant="warning">
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					If you set a non-default value for `spans_key`, you'll have to update
 | 
				
			||||||
 | 
					`[training.score_weights]` as well so that weights are computed properly. E. g.
 | 
				
			||||||
 | 
					for `spans_key == "myspankey"`, include this in your config:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```ini
 | 
				
			||||||
 | 
					[training.score_weights]
 | 
				
			||||||
 | 
					spans_myspankey_f = 1.0
 | 
				
			||||||
 | 
					spans_myspankey_p = 0.0
 | 
				
			||||||
 | 
					spans_myspankey_r = 0.0
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					</Infobox>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```python
 | 
					```python
 | 
				
			||||||
%%GITHUB_SPACY/spacy/pipeline/spancat.py
 | 
					%%GITHUB_SPACY/spacy/pipeline/spancat.py
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -397,6 +397,17 @@ are wrapped into the
 | 
				
			||||||
by this class. Instances of this class are typically assigned to the
 | 
					by this class. Instances of this class are typically assigned to the
 | 
				
			||||||
[`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute.
 | 
					[`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### Example
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```python
 | 
				
			||||||
 | 
					> # Get the last hidden layer output for "is" (token index 1)
 | 
				
			||||||
 | 
					> doc = nlp("This is a text.")
 | 
				
			||||||
 | 
					> indices = doc._.trf_data.align[1].data.flatten()
 | 
				
			||||||
 | 
					> last_hidden_state = doc._.trf_data.model_output.last_hidden_state
 | 
				
			||||||
 | 
					> dim = last_hidden_state.shape[-1]
 | 
				
			||||||
 | 
					> tensors = last_hidden_state.reshape(-1, dim)[indices]
 | 
				
			||||||
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name           | Description                                                                                                                                                                                                                                                                                                                          |
 | 
					| Name           | Description                                                                                                                                                                                                                                                                                                                          |
 | 
				
			||||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
					| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
				
			||||||
| `tokens`       | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~                       |
 | 
					| `tokens`       | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~                       |
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -13,7 +13,7 @@ between `Doc` objects.
 | 
				
			||||||
<Infobox variant ="warning">
 | 
					<Infobox variant ="warning">
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Note that a `Vocab` instance is not static. It increases in size as texts with
 | 
					Note that a `Vocab` instance is not static. It increases in size as texts with
 | 
				
			||||||
new tokens are processed.
 | 
					new tokens are processed. Some models may have an empty vocab at initialization.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
</Infobox>
 | 
					</Infobox>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -93,6 +93,7 @@ given string, you need to look it up in
 | 
				
			||||||
> #### Example
 | 
					> #### Example
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> ```python
 | 
					> ```python
 | 
				
			||||||
 | 
					> nlp("I'm eating an apple")
 | 
				
			||||||
> apple = nlp.vocab.strings["apple"]
 | 
					> apple = nlp.vocab.strings["apple"]
 | 
				
			||||||
> oov = nlp.vocab.strings["dskfodkfos"]
 | 
					> oov = nlp.vocab.strings["dskfodkfos"]
 | 
				
			||||||
> assert apple in nlp.vocab
 | 
					> assert apple in nlp.vocab
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
		 Before Width: | Height: | Size: 6.8 KiB After Width: | Height: | Size: 6.8 KiB  | 
| 
						 | 
					@ -108,12 +108,12 @@ In the `sm`/`md`/`lg` models:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#### CNN/CPU pipelines with floret vectors
 | 
					#### CNN/CPU pipelines with floret vectors
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The Finnish, Korean and Swedish `md` and `lg` pipelines use
 | 
					The Croatian, Finnish, Korean, Slovenian, Swedish and Ukrainian `md` and `lg`
 | 
				
			||||||
[floret vectors](/usage/v3-2#vectors) instead of default vectors. If you're
 | 
					pipelines use [floret vectors](/usage/v3-2#vectors) instead of default vectors.
 | 
				
			||||||
running a trained pipeline on texts and working with [`Doc`](/api/doc) objects,
 | 
					If you're running a trained pipeline on texts and working with [`Doc`](/api/doc)
 | 
				
			||||||
you shouldn't notice any difference with floret vectors. With floret vectors no
 | 
					objects, you shouldn't notice any difference with floret vectors. With floret
 | 
				
			||||||
tokens are out-of-vocabulary, so [`Token.is_oov`](/api/token#attributes) will
 | 
					vectors no tokens are out-of-vocabulary, so
 | 
				
			||||||
return `False` for all tokens.
 | 
					[`Token.is_oov`](/api/token#attributes) will return `False` for all tokens.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
If you access vectors directly for similarity comparisons, there are a few
 | 
					If you access vectors directly for similarity comparisons, there are a few
 | 
				
			||||||
differences because floret vectors don't include a fixed word list like the
 | 
					differences because floret vectors don't include a fixed word list like the
 | 
				
			||||||
| 
						 | 
					@ -132,10 +132,20 @@ vector keys for default vectors.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Transformer pipeline design {id="design-trf"}
 | 
					### Transformer pipeline design {id="design-trf"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
 | 
					In the transformer (`trf`) pipelines, the `tagger`, `parser` and `ner` (if
 | 
				
			||||||
all listen to the `transformer` component. The `attribute_ruler` and
 | 
					present) all listen to the `transformer` component. The `attribute_ruler` and
 | 
				
			||||||
`lemmatizer` have the same configuration as in the CNN models.
 | 
					`lemmatizer` have the same configuration as in the CNN models.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					For spaCy v3.0-v3.6, `trf` pipelines use
 | 
				
			||||||
 | 
					[`spacy-transformers`](https://github.com/explosion/spacy-transformers) and the
 | 
				
			||||||
 | 
					transformer output in `doc._.trf_data` is a
 | 
				
			||||||
 | 
					[`TransformerData`](/api/transformer#transformerdata) object.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					For spaCy v3.7+, `trf` pipelines use
 | 
				
			||||||
 | 
					[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers)
 | 
				
			||||||
 | 
					and `doc._.trf_data` is a
 | 
				
			||||||
 | 
					[`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Modifying the default pipeline {id="design-modify"}
 | 
					### Modifying the default pipeline {id="design-modify"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
For faster processing, you may only want to run a subset of the components in a
 | 
					For faster processing, you may only want to run a subset of the components in a
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -31,8 +31,6 @@ for ent in doc.ents:
 | 
				
			||||||
Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what
 | 
					Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what
 | 
				
			||||||
our example sentence and its named entities look like:
 | 
					our example sentence and its named entities look like:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<Standalone height={120}>
 | 
				
			||||||
  title="displaCy visualization of entities"
 | 
					<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}><mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Apple <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> is looking at buying <mark style={{ background: '#feca74', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>U.K. <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>GPE</span></mark> startup for <mark style={{ background: '#e4e7d2', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>$1 billion <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>MONEY</span></mark></div>
 | 
				
			||||||
  src="/images/displacy-ent1.html"
 | 
					</Standalone>
 | 
				
			||||||
  height={100}
 | 
					 | 
				
			||||||
/>
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -56,8 +56,7 @@ for token in doc:
 | 
				
			||||||
Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what
 | 
					Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what
 | 
				
			||||||
our example sentence and its dependencies look like:
 | 
					our example sentence and its dependencies look like:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<ImageScrollable
 | 
				
			||||||
  title="displaCy visualization of dependencies and entities"
 | 
					  src="/images/displacy-long.svg"
 | 
				
			||||||
  src="/images/displacy-long.html"
 | 
					  width={1975}
 | 
				
			||||||
  height={450}
 | 
					 | 
				
			||||||
/>
 | 
					/>
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -153,8 +153,9 @@ maxout_pieces = 3
 | 
				
			||||||
depth = 2
 | 
					depth = 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[components.textcat.model.linear_model]
 | 
					[components.textcat.model.linear_model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = true
 | 
					exclusive_classes = true
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
| 
						 | 
					@ -170,8 +171,9 @@ factory = "textcat"
 | 
				
			||||||
labels = []
 | 
					labels = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[components.textcat.model]
 | 
					[components.textcat.model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = true
 | 
					exclusive_classes = true
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
nO = null
 | 
					nO = null
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -290,11 +290,7 @@ for token in doc:
 | 
				
			||||||
| toward        | `prep`     | shift     | `NOUN`   | manufacturers           |
 | 
					| toward        | `prep`     | shift     | `NOUN`   | manufacturers           |
 | 
				
			||||||
| manufacturers | `pobj`     | toward    | `ADP`    |                         |
 | 
					| manufacturers | `pobj`     | toward    | `ADP`    |                         |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<ImageScrollable src="/images/displacy-long2.svg" width={1275} />
 | 
				
			||||||
  title="displaCy visualization of dependencies and entities 2"
 | 
					 | 
				
			||||||
  src="/images/displacy-long2.html"
 | 
					 | 
				
			||||||
  height={450}
 | 
					 | 
				
			||||||
/>
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
Because the syntactic relations form a tree, every word has **exactly one
 | 
					Because the syntactic relations form a tree, every word has **exactly one
 | 
				
			||||||
head**. You can therefore iterate over the arcs in the tree by iterating over
 | 
					head**. You can therefore iterate over the arcs in the tree by iterating over
 | 
				
			||||||
| 
						 | 
					@ -709,11 +705,9 @@ doc = nlp(text)
 | 
				
			||||||
displacy.serve(doc, style="ent")
 | 
					displacy.serve(doc, style="ent")
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<Standalone height={180}>
 | 
				
			||||||
  title="displaCy visualizer for entities"
 | 
					<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>When <mark style={{ background: '#aa9cfc', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Sebastian Thrun <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>PERSON</span></mark> started working on self-driving cars at <mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> in <mark style={{ background: '#bfe1d9', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>2007 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>DATE</span></mark>, few people outside of the company took him seriously.</div>
 | 
				
			||||||
  src="/images/displacy-ent2.html"
 | 
					</Standalone>
 | 
				
			||||||
  height={180}
 | 
					 | 
				
			||||||
/>
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Entity Linking {id="entity-linking"}
 | 
					## Entity Linking {id="entity-linking"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -723,6 +717,10 @@ identifier from a knowledge base (KB). You can create your own
 | 
				
			||||||
[`KnowledgeBase`](/api/kb) and [train](/usage/training) a new
 | 
					[`KnowledgeBase`](/api/kb) and [train](/usage/training) a new
 | 
				
			||||||
[`EntityLinker`](/api/entitylinker) using that custom knowledge base.
 | 
					[`EntityLinker`](/api/entitylinker) using that custom knowledge base.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					For an example of how to define a KnowledgeBase and train an entity linker model,
 | 
				
			||||||
 | 
					see [`this tutorial`](https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson)
 | 
				
			||||||
 | 
					using [spaCy projects](/usage/projects).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Accessing entity identifiers {id="entity-linking-accessing",model="entity linking"}
 | 
					### Accessing entity identifiers {id="entity-linking-accessing",model="entity linking"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The annotated KB identifier is accessible as either a hash value or as a string,
 | 
					The annotated KB identifier is accessible as either a hash value or as a string,
 | 
				
			||||||
| 
						 | 
					@ -733,6 +731,7 @@ object, or the `ent_kb_id` and `ent_kb_id_` attributes of a
 | 
				
			||||||
```python
 | 
					```python
 | 
				
			||||||
import spacy
 | 
					import spacy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# "my_custom_el_pipeline" is assumed to be a custom NLP pipeline that was trained and serialized to disk
 | 
				
			||||||
nlp = spacy.load("my_custom_el_pipeline")
 | 
					nlp = spacy.load("my_custom_el_pipeline")
 | 
				
			||||||
doc = nlp("Ada Lovelace was born in London")
 | 
					doc = nlp("Ada Lovelace was born in London")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1328,8 +1328,9 @@ labels = []
 | 
				
			||||||
# This function is created and then passed to the "textcat" component as
 | 
					# This function is created and then passed to the "textcat" component as
 | 
				
			||||||
# the argument "model"
 | 
					# the argument "model"
 | 
				
			||||||
[components.textcat.model]
 | 
					[components.textcat.model]
 | 
				
			||||||
@architectures = "spacy.TextCatBOW.v2"
 | 
					@architectures = "spacy.TextCatBOW.v3"
 | 
				
			||||||
exclusive_classes = true
 | 
					exclusive_classes = true
 | 
				
			||||||
 | 
					length = 262144
 | 
				
			||||||
ngram_size = 1
 | 
					ngram_size = 1
 | 
				
			||||||
no_output_layer = false
 | 
					no_output_layer = false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1144,10 +1144,9 @@ relations and tokens we want to match:
 | 
				
			||||||
> displacy.serve(doc)
 | 
					> displacy.serve(doc)
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<ImageScrollable
 | 
				
			||||||
  title="displaCy visualization of dependencies"
 | 
					  src="/images/displacy-dep-founded.svg"
 | 
				
			||||||
  src="/images/displacy-dep-founded.html"
 | 
					  width={925}
 | 
				
			||||||
  height={450}
 | 
					 | 
				
			||||||
/>
 | 
					/>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The relations we're interested in are:
 | 
					The relations we're interested in are:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -405,7 +405,7 @@ available to spaCy, all you need to do is install the package in your
 | 
				
			||||||
environment:
 | 
					environment:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```bash
 | 
					```bash
 | 
				
			||||||
$ python setup.py develop
 | 
					$ python -m pip install .
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
spaCy is now able to create the pipeline component `"snek"` – even though you
 | 
					spaCy is now able to create the pipeline component `"snek"` – even though you
 | 
				
			||||||
| 
						 | 
					@ -586,11 +586,9 @@ After installing the package, the custom colors will be used when visualizing
 | 
				
			||||||
text with `displacy`. Whenever the label `SNEK` is assigned, it will be
 | 
					text with `displacy`. Whenever the label `SNEK` is assigned, it will be
 | 
				
			||||||
displayed in `#3dff74`.
 | 
					displayed in `#3dff74`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<Standalone height={100}>
 | 
				
			||||||
  title="displaCy visualization of entities"
 | 
					<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>🌱🌿 <mark style={{ background: '#3dff74', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>🐍 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>SNEK</span></mark> ____ 🌳🌲 ____ <mark style={{ background: '#cfc5ff', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>👨🌾 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>HUMAN</span></mark> 🏘️</div>
 | 
				
			||||||
  src="/images/displacy-ent-snek.html"
 | 
					</Standalone>
 | 
				
			||||||
  height={100}
 | 
					 | 
				
			||||||
/>
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Saving, loading and distributing trained pipelines {id="models"}
 | 
					## Saving, loading and distributing trained pipelines {id="models"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -675,7 +673,7 @@ $ python -m spacy package ./en_example_pipeline ./packages
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
This command will create a pipeline package directory and will run
 | 
					This command will create a pipeline package directory and will run
 | 
				
			||||||
`python setup.py sdist` in that directory to create a binary `.whl` file or
 | 
					`python -m build` in that directory to create a binary `.whl` file or
 | 
				
			||||||
`.tar.gz` archive of your package that can be installed using `pip install`.
 | 
					`.tar.gz` archive of your package that can be installed using `pip install`.
 | 
				
			||||||
Installing the binary wheel is usually more efficient.
 | 
					Installing the binary wheel is usually more efficient.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -77,11 +77,9 @@ doc.spans["custom"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
 | 
				
			||||||
displacy.serve(doc, style="span", options={"spans_key": "custom"})
 | 
					displacy.serve(doc, style="span", options={"spans_key": "custom"})
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<Standalone height={100}>
 | 
				
			||||||
  title="displaCy visualizer for overlapping spans"
 | 
					<div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#7aecec', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#7aecec', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>ORG</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#feca74', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>GPE</span></span></span>.</div>
 | 
				
			||||||
  src="/images/displacy-span.html"
 | 
					</Standalone>
 | 
				
			||||||
  height={180}
 | 
					 | 
				
			||||||
/>
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Additional features and improvements
 | 
					## Additional features and improvements
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -119,11 +119,9 @@ doc = nlp(text)
 | 
				
			||||||
displacy.serve(doc, style="ent")
 | 
					displacy.serve(doc, style="ent")
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<Standalone height={180}>
 | 
				
			||||||
  title="displaCy visualizer for entities"
 | 
					<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>When <mark style={{ background: '#aa9cfc', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Sebastian Thrun <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>PERSON</span></mark> started working on self-driving cars at <mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> in <mark style={{ background: '#bfe1d9', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>2007 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>DATE</span></mark>, few people outside of the company took him seriously.</div>
 | 
				
			||||||
  src="/images/displacy-ent2.html"
 | 
					</Standalone>
 | 
				
			||||||
  height={180}
 | 
					 | 
				
			||||||
/>
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
The entity visualizer lets you customize the following `options`:
 | 
					The entity visualizer lets you customize the following `options`:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -148,11 +146,9 @@ use the `colors` setting to add your own colors for them.
 | 
				
			||||||
> displacy.serve(doc, style="ent", options=options)
 | 
					> displacy.serve(doc, style="ent", options=options)
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<Standalone height={225}>
 | 
				
			||||||
  title="displaCy visualizer for entities (custom styling)"
 | 
					<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>But <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> is starting from behind. The company made a late push into hardware, and <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Apple <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark>’s Siri, available on iPhones, and <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Amazon <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark>’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer adoption.</div>
 | 
				
			||||||
  src="/images/displacy-ent-custom.html"
 | 
					</Standalone>
 | 
				
			||||||
  height={225}
 | 
					 | 
				
			||||||
/>
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
The above example uses a little trick: Since the background color values are
 | 
					The above example uses a little trick: Since the background color values are
 | 
				
			||||||
added as the `background` style attribute, you can use any
 | 
					added as the `background` style attribute, you can use any
 | 
				
			||||||
| 
						 | 
					@ -197,11 +193,9 @@ doc.spans["sc"] = [
 | 
				
			||||||
displacy.serve(doc, style="span")
 | 
					displacy.serve(doc, style="span")
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<Standalone height={100}>
 | 
				
			||||||
  title="displaCy visualizer for overlapping spans"
 | 
					<div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#7aecec', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#7aecec', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>ORG</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#feca74', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>GPE</span></span></span>.</div>
 | 
				
			||||||
  src="/images/displacy-span.html"
 | 
					</Standalone>
 | 
				
			||||||
  height={180}
 | 
					 | 
				
			||||||
/>
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
The span visualizer lets you customize the following `options`:
 | 
					The span visualizer lets you customize the following `options`:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -223,11 +217,9 @@ specify which one displaCy should use with `spans_key` (`sc` is the default).
 | 
				
			||||||
> displacy.serve(doc, style="span", options=options)
 | 
					> displacy.serve(doc, style="span", options=options)
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
<Iframe
 | 
					<Standalone height={100}>
 | 
				
			||||||
  title="displaCy visualizer for spans (custom spans_key)"
 | 
					<div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#ddd', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#ddd', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>BANK</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span>.</div>
 | 
				
			||||||
  src="/images/displacy-span-custom.html"
 | 
					</Standalone>
 | 
				
			||||||
  height={225}
 | 
					 | 
				
			||||||
/>
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Using displaCy in Jupyter notebooks {id="jupyter"}
 | 
					## Using displaCy in Jupyter notebooks {id="jupyter"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -103,6 +103,10 @@
 | 
				
			||||||
            "has_examples": true,
 | 
					            "has_examples": true,
 | 
				
			||||||
            "models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"]
 | 
					            "models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"]
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "code": "fo",
 | 
				
			||||||
 | 
					            "name": "Faroese"
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
        {
 | 
					        {
 | 
				
			||||||
            "code": "fr",
 | 
					            "code": "fr",
 | 
				
			||||||
            "name": "French",
 | 
					            "name": "French",
 | 
				
			||||||
| 
						 | 
					@ -290,6 +294,12 @@
 | 
				
			||||||
            "example": "Dit is een zin.",
 | 
					            "example": "Dit is een zin.",
 | 
				
			||||||
            "has_examples": true
 | 
					            "has_examples": true
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "code": "nn",
 | 
				
			||||||
 | 
					            "name": "Norwegian Nynorsk",
 | 
				
			||||||
 | 
					            "example": "Det er ein meir enn i same periode i fjor.",
 | 
				
			||||||
 | 
					            "has_examples": true
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
        {
 | 
					        {
 | 
				
			||||||
            "code": "pl",
 | 
					            "code": "pl",
 | 
				
			||||||
            "name": "Polish",
 | 
					            "name": "Polish",
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -9,14 +9,9 @@
 | 
				
			||||||
                    { "text": "Models & Languages", "url": "/usage/models" },
 | 
					                    { "text": "Models & Languages", "url": "/usage/models" },
 | 
				
			||||||
                    { "text": "Facts & Figures", "url": "/usage/facts-figures" },
 | 
					                    { "text": "Facts & Figures", "url": "/usage/facts-figures" },
 | 
				
			||||||
                    { "text": "spaCy 101", "url": "/usage/spacy-101" },
 | 
					                    { "text": "spaCy 101", "url": "/usage/spacy-101" },
 | 
				
			||||||
                    { "text": "New in v3.0", "url": "/usage/v3" },
 | 
					                    { "text": "New in v3.7", "url": "/usage/v3-7" },
 | 
				
			||||||
                    { "text": "New in v3.1", "url": "/usage/v3-1" },
 | 
					 | 
				
			||||||
                    { "text": "New in v3.2", "url": "/usage/v3-2" },
 | 
					 | 
				
			||||||
                    { "text": "New in v3.3", "url": "/usage/v3-3" },
 | 
					 | 
				
			||||||
                    { "text": "New in v3.4", "url": "/usage/v3-4" },
 | 
					 | 
				
			||||||
                    { "text": "New in v3.5", "url": "/usage/v3-5" },
 | 
					 | 
				
			||||||
                    { "text": "New in v3.6", "url": "/usage/v3-6" },
 | 
					                    { "text": "New in v3.6", "url": "/usage/v3-6" },
 | 
				
			||||||
                    { "text": "New in v3.7", "url": "/usage/v3-7" }
 | 
					                    { "text": "New in v3.5", "url": "/usage/v3-5" }
 | 
				
			||||||
                ]
 | 
					                ]
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            {
 | 
					            {
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -66,6 +66,10 @@
 | 
				
			||||||
                {
 | 
					                {
 | 
				
			||||||
                    "text": "Stack Overflow",
 | 
					                    "text": "Stack Overflow",
 | 
				
			||||||
                    "url": "http://stackoverflow.com/questions/tagged/spacy"
 | 
					                    "url": "http://stackoverflow.com/questions/tagged/spacy"
 | 
				
			||||||
 | 
					                },
 | 
				
			||||||
 | 
					                {
 | 
				
			||||||
 | 
					                    "text": "Merchandise",
 | 
				
			||||||
 | 
					                    "url": "https://explosion.ai/merch"
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
            ]
 | 
					            ]
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4500,6 +4500,23 @@
 | 
				
			||||||
                "website": "https://nlp.unibuc.ro/people/snisioi.html"
 | 
					                "website": "https://nlp.unibuc.ro/people/snisioi.html"
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
            "category": ["pipeline", "training", "models"]
 | 
					            "category": ["pipeline", "training", "models"]
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            "id": "redfield-spacy-nodes",
 | 
				
			||||||
 | 
					            "title": "Redfield NLP Nodes for KNIME",
 | 
				
			||||||
 | 
					            "slogan": "Makes the functionality of the spaCy library available in KNIME Analytics Platform.",
 | 
				
			||||||
 | 
					            "description": "This extension provides nodes that make the functionality of the spaCy library available in the [KNIME Analytics Platform](https://www.knime.com/).",
 | 
				
			||||||
 | 
					            "github": "Redfield-AB/Spacy-Nodes",
 | 
				
			||||||
 | 
					            "url": "https://redfield.ai/spacy-redfield/",
 | 
				
			||||||
 | 
					            "thumb": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/redfield_logo_100x100.png",
 | 
				
			||||||
 | 
					            "image": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/screen1.png",
 | 
				
			||||||
 | 
					            "author": "Redfield AB",
 | 
				
			||||||
 | 
					            "author_links": {
 | 
				
			||||||
 | 
					                "twitter": "Redfield_AB",
 | 
				
			||||||
 | 
					                "github": "Redfield-AB",
 | 
				
			||||||
 | 
					                "website": "https://redfield.ai"
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            "category": ["standalone"]
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    ],
 | 
					    ],
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
		 Before Width: | Height: | Size: 5.1 KiB After Width: | Height: | Size: 5.1 KiB  | 
| 
						 | 
					@ -1,80 +0,0 @@
 | 
				
			||||||
<div
 | 
					 | 
				
			||||||
    class="entities"
 | 
					 | 
				
			||||||
    style="
 | 
					 | 
				
			||||||
        line-height: 2.5;
 | 
					 | 
				
			||||||
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
 | 
					 | 
				
			||||||
            'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
 | 
					 | 
				
			||||||
        font-size: 18px;
 | 
					 | 
				
			||||||
    "
 | 
					 | 
				
			||||||
    >But
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: linear-gradient(90deg, #aa9cfc, #fc9ce7);
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
        >Google
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >ORG</span
 | 
					 | 
				
			||||||
        ></mark
 | 
					 | 
				
			||||||
    >is starting from behind. The company made a late push into hardware, and
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: linear-gradient(90deg, #aa9cfc, #fc9ce7);
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
        >Apple
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >ORG</span
 | 
					 | 
				
			||||||
        ></mark
 | 
					 | 
				
			||||||
    >’s Siri, available on iPhones, and
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: linear-gradient(90deg, #aa9cfc, #fc9ce7);
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
        >Amazon
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >ORG</span
 | 
					 | 
				
			||||||
        ></mark
 | 
					 | 
				
			||||||
    >’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer
 | 
					 | 
				
			||||||
    adoption.</div
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
| 
						 | 
					@ -1,59 +0,0 @@
 | 
				
			||||||
<div
 | 
					 | 
				
			||||||
    class="entities"
 | 
					 | 
				
			||||||
    style="
 | 
					 | 
				
			||||||
        line-height: 2.5;
 | 
					 | 
				
			||||||
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
 | 
					 | 
				
			||||||
            'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
 | 
					 | 
				
			||||||
        font-size: 16px;
 | 
					 | 
				
			||||||
    "
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
    🌱🌿
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: #3dff74;
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
        >🐍
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >SNEK</span
 | 
					 | 
				
			||||||
        ></mark
 | 
					 | 
				
			||||||
    >
 | 
					 | 
				
			||||||
    ____ 🌳🌲 ____
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: #cfc5ff;
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
        >👨🌾
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >HUMAN</span
 | 
					 | 
				
			||||||
        ></mark
 | 
					 | 
				
			||||||
    >
 | 
					 | 
				
			||||||
    🏘️
 | 
					 | 
				
			||||||
</div>
 | 
					 | 
				
			||||||
| 
						 | 
					@ -1,84 +0,0 @@
 | 
				
			||||||
<div
 | 
					 | 
				
			||||||
    class="entities"
 | 
					 | 
				
			||||||
    style="
 | 
					 | 
				
			||||||
        line-height: 2.5;
 | 
					 | 
				
			||||||
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
 | 
					 | 
				
			||||||
            'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
 | 
					 | 
				
			||||||
        font-size: 16px;
 | 
					 | 
				
			||||||
    "
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: #7aecec;
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
    >
 | 
					 | 
				
			||||||
        Apple
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >ORG</span
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
    </mark>
 | 
					 | 
				
			||||||
    is looking at buying
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: #feca74;
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
    >
 | 
					 | 
				
			||||||
        U.K.
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >GPE</span
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
    </mark>
 | 
					 | 
				
			||||||
    startup for
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: #e4e7d2;
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
    >
 | 
					 | 
				
			||||||
        $1 billion
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >MONEY</span
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
    </mark>
 | 
					 | 
				
			||||||
</div>
 | 
					 | 
				
			||||||
| 
						 | 
					@ -1,86 +0,0 @@
 | 
				
			||||||
<div
 | 
					 | 
				
			||||||
    class="entities"
 | 
					 | 
				
			||||||
    style="
 | 
					 | 
				
			||||||
        line-height: 2.5;
 | 
					 | 
				
			||||||
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
 | 
					 | 
				
			||||||
            'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
 | 
					 | 
				
			||||||
        font-size: 18px;
 | 
					 | 
				
			||||||
    "
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
    When
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: #aa9cfc;
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
    >
 | 
					 | 
				
			||||||
        Sebastian Thrun
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >PERSON</span
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
    </mark>
 | 
					 | 
				
			||||||
    started working on self-driving cars at
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: #7aecec;
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
    >
 | 
					 | 
				
			||||||
        Google
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >ORG</span
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
    </mark>
 | 
					 | 
				
			||||||
    in
 | 
					 | 
				
			||||||
    <mark
 | 
					 | 
				
			||||||
        class="entity"
 | 
					 | 
				
			||||||
        style="
 | 
					 | 
				
			||||||
            background: #bfe1d9;
 | 
					 | 
				
			||||||
            padding: 0.45em 0.6em;
 | 
					 | 
				
			||||||
            margin: 0 0.25em;
 | 
					 | 
				
			||||||
            line-height: 1;
 | 
					 | 
				
			||||||
            border-radius: 0.35em;
 | 
					 | 
				
			||||||
        "
 | 
					 | 
				
			||||||
    >
 | 
					 | 
				
			||||||
        2007
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                font-size: 0.8em;
 | 
					 | 
				
			||||||
                font-weight: bold;
 | 
					 | 
				
			||||||
                line-height: 1;
 | 
					 | 
				
			||||||
                border-radius: 0.35em;
 | 
					 | 
				
			||||||
                text-transform: uppercase;
 | 
					 | 
				
			||||||
                vertical-align: middle;
 | 
					 | 
				
			||||||
                margin-left: 0.5rem;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
            >DATE</span
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
    </mark>
 | 
					 | 
				
			||||||
    , few people outside of the company took him seriously.
 | 
					 | 
				
			||||||
</div>
 | 
					 | 
				
			||||||
| 
		 Before Width: | Height: | Size: 11 KiB After Width: | Height: | Size: 11 KiB  | 
							
								
								
									
										212
									
								
								website/public/images/displacy-long2.svg
									
									
									
									
									
										Normal file
									
								
							
							
						
						| 
						 | 
					@ -0,0 +1,212 @@
 | 
				
			||||||
 | 
					<svg
 | 
				
			||||||
 | 
					    xmlns="http://www.w3.org/2000/svg"
 | 
				
			||||||
 | 
					    xmlns:xlink="http://www.w3.org/1999/xlink"
 | 
				
			||||||
 | 
					    id="0"
 | 
				
			||||||
 | 
					    class="displacy"
 | 
				
			||||||
 | 
					    width="1275"
 | 
				
			||||||
 | 
					    height="399.5"
 | 
				
			||||||
 | 
					    style="
 | 
				
			||||||
 | 
					        max-width: none;
 | 
				
			||||||
 | 
					        height: 399.5px;
 | 
				
			||||||
 | 
					        color: #000000;
 | 
				
			||||||
 | 
					        background: #ffffff;
 | 
				
			||||||
 | 
					        font-family: Arial;
 | 
				
			||||||
 | 
					    "
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					    <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
 | 
				
			||||||
 | 
					        <tspan class="displacy-word" fill="currentColor" x="50">Autonomous</tspan>
 | 
				
			||||||
 | 
					        <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">ADJ</tspan>
 | 
				
			||||||
 | 
					    </text>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
 | 
				
			||||||
 | 
					        <tspan class="displacy-word" fill="currentColor" x="225">cars</tspan>
 | 
				
			||||||
 | 
					        <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">NOUN</tspan>
 | 
				
			||||||
 | 
					    </text>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
 | 
				
			||||||
 | 
					        <tspan class="displacy-word" fill="currentColor" x="400">shift</tspan>
 | 
				
			||||||
 | 
					        <tspan class="displacy-tag" dy="2em" fill="currentColor" x="400">VERB</tspan>
 | 
				
			||||||
 | 
					    </text>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
 | 
				
			||||||
 | 
					        <tspan class="displacy-word" fill="currentColor" x="575">insurance</tspan>
 | 
				
			||||||
 | 
					        <tspan class="displacy-tag" dy="2em" fill="currentColor" x="575">NOUN</tspan>
 | 
				
			||||||
 | 
					    </text>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
 | 
				
			||||||
 | 
					        <tspan class="displacy-word" fill="currentColor" x="750">liability</tspan>
 | 
				
			||||||
 | 
					        <tspan class="displacy-tag" dy="2em" fill="currentColor" x="750">NOUN</tspan>
 | 
				
			||||||
 | 
					    </text>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
 | 
				
			||||||
 | 
					        <tspan class="displacy-word" fill="currentColor" x="925">toward</tspan>
 | 
				
			||||||
 | 
					        <tspan class="displacy-tag" dy="2em" fill="currentColor" x="925">ADP</tspan>
 | 
				
			||||||
 | 
					    </text>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
 | 
				
			||||||
 | 
					        <tspan class="displacy-word" fill="currentColor" x="1100">manufacturers</tspan>
 | 
				
			||||||
 | 
					        <tspan class="displacy-tag" dy="2em" fill="currentColor" x="1100">NOUN</tspan>
 | 
				
			||||||
 | 
					    </text>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <g class="displacy-arrow">
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arc"
 | 
				
			||||||
 | 
					            id="arrow-0-0"
 | 
				
			||||||
 | 
					            stroke-width="2px"
 | 
				
			||||||
 | 
					            d="M70,264.5 C70,177.0 215.0,177.0 215.0,264.5"
 | 
				
			||||||
 | 
					            fill="none"
 | 
				
			||||||
 | 
					            stroke="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					        <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
 | 
				
			||||||
 | 
					            <textpath
 | 
				
			||||||
 | 
					                xlink:href="#arrow-0-0"
 | 
				
			||||||
 | 
					                class="displacy-label"
 | 
				
			||||||
 | 
					                startOffset="50%"
 | 
				
			||||||
 | 
					                fill="currentColor"
 | 
				
			||||||
 | 
					                text-anchor="middle"
 | 
				
			||||||
 | 
					            >
 | 
				
			||||||
 | 
					                amod
 | 
				
			||||||
 | 
					            </textpath>
 | 
				
			||||||
 | 
					        </text>
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arrowhead"
 | 
				
			||||||
 | 
					            d="M70,266.5 L62,254.5 78,254.5"
 | 
				
			||||||
 | 
					            fill="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					    </g>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <g class="displacy-arrow">
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arc"
 | 
				
			||||||
 | 
					            id="arrow-0-1"
 | 
				
			||||||
 | 
					            stroke-width="2px"
 | 
				
			||||||
 | 
					            d="M245,264.5 C245,177.0 390.0,177.0 390.0,264.5"
 | 
				
			||||||
 | 
					            fill="none"
 | 
				
			||||||
 | 
					            stroke="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					        <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
 | 
				
			||||||
 | 
					            <textpath
 | 
				
			||||||
 | 
					                xlink:href="#arrow-0-1"
 | 
				
			||||||
 | 
					                class="displacy-label"
 | 
				
			||||||
 | 
					                startOffset="50%"
 | 
				
			||||||
 | 
					                fill="currentColor"
 | 
				
			||||||
 | 
					                text-anchor="middle"
 | 
				
			||||||
 | 
					            >
 | 
				
			||||||
 | 
					                nsubj
 | 
				
			||||||
 | 
					            </textpath>
 | 
				
			||||||
 | 
					        </text>
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arrowhead"
 | 
				
			||||||
 | 
					            d="M245,266.5 L237,254.5 253,254.5"
 | 
				
			||||||
 | 
					            fill="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					    </g>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <g class="displacy-arrow">
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arc"
 | 
				
			||||||
 | 
					            id="arrow-0-2"
 | 
				
			||||||
 | 
					            stroke-width="2px"
 | 
				
			||||||
 | 
					            d="M595,264.5 C595,177.0 740.0,177.0 740.0,264.5"
 | 
				
			||||||
 | 
					            fill="none"
 | 
				
			||||||
 | 
					            stroke="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					        <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
 | 
				
			||||||
 | 
					            <textpath
 | 
				
			||||||
 | 
					                xlink:href="#arrow-0-2"
 | 
				
			||||||
 | 
					                class="displacy-label"
 | 
				
			||||||
 | 
					                startOffset="50%"
 | 
				
			||||||
 | 
					                fill="currentColor"
 | 
				
			||||||
 | 
					                text-anchor="middle"
 | 
				
			||||||
 | 
					            >
 | 
				
			||||||
 | 
					                compound
 | 
				
			||||||
 | 
					            </textpath>
 | 
				
			||||||
 | 
					        </text>
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arrowhead"
 | 
				
			||||||
 | 
					            d="M595,266.5 L587,254.5 603,254.5"
 | 
				
			||||||
 | 
					            fill="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					    </g>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <g class="displacy-arrow">
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arc"
 | 
				
			||||||
 | 
					            id="arrow-0-3"
 | 
				
			||||||
 | 
					            stroke-width="2px"
 | 
				
			||||||
 | 
					            d="M420,264.5 C420,89.5 745.0,89.5 745.0,264.5"
 | 
				
			||||||
 | 
					            fill="none"
 | 
				
			||||||
 | 
					            stroke="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					        <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
 | 
				
			||||||
 | 
					            <textpath
 | 
				
			||||||
 | 
					                xlink:href="#arrow-0-3"
 | 
				
			||||||
 | 
					                class="displacy-label"
 | 
				
			||||||
 | 
					                startOffset="50%"
 | 
				
			||||||
 | 
					                fill="currentColor"
 | 
				
			||||||
 | 
					                text-anchor="middle"
 | 
				
			||||||
 | 
					            >
 | 
				
			||||||
 | 
					                dobj
 | 
				
			||||||
 | 
					            </textpath>
 | 
				
			||||||
 | 
					        </text>
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arrowhead"
 | 
				
			||||||
 | 
					            d="M745.0,266.5 L753.0,254.5 737.0,254.5"
 | 
				
			||||||
 | 
					            fill="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					    </g>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <g class="displacy-arrow">
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arc"
 | 
				
			||||||
 | 
					            id="arrow-0-4"
 | 
				
			||||||
 | 
					            stroke-width="2px"
 | 
				
			||||||
 | 
					            d="M420,264.5 C420,2.0 925.0,2.0 925.0,264.5"
 | 
				
			||||||
 | 
					            fill="none"
 | 
				
			||||||
 | 
					            stroke="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					        <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
 | 
				
			||||||
 | 
					            <textpath
 | 
				
			||||||
 | 
					                xlink:href="#arrow-0-4"
 | 
				
			||||||
 | 
					                class="displacy-label"
 | 
				
			||||||
 | 
					                startOffset="50%"
 | 
				
			||||||
 | 
					                fill="currentColor"
 | 
				
			||||||
 | 
					                text-anchor="middle"
 | 
				
			||||||
 | 
					            >
 | 
				
			||||||
 | 
					                prep
 | 
				
			||||||
 | 
					            </textpath>
 | 
				
			||||||
 | 
					        </text>
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arrowhead"
 | 
				
			||||||
 | 
					            d="M925.0,266.5 L933.0,254.5 917.0,254.5"
 | 
				
			||||||
 | 
					            fill="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					    </g>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    <g class="displacy-arrow">
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arc"
 | 
				
			||||||
 | 
					            id="arrow-0-5"
 | 
				
			||||||
 | 
					            stroke-width="2px"
 | 
				
			||||||
 | 
					            d="M945,264.5 C945,177.0 1090.0,177.0 1090.0,264.5"
 | 
				
			||||||
 | 
					            fill="none"
 | 
				
			||||||
 | 
					            stroke="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					        <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
 | 
				
			||||||
 | 
					            <textpath
 | 
				
			||||||
 | 
					                xlink:href="#arrow-0-5"
 | 
				
			||||||
 | 
					                class="displacy-label"
 | 
				
			||||||
 | 
					                startOffset="50%"
 | 
				
			||||||
 | 
					                fill="currentColor"
 | 
				
			||||||
 | 
					                text-anchor="middle"
 | 
				
			||||||
 | 
					            >
 | 
				
			||||||
 | 
					                pobj
 | 
				
			||||||
 | 
					            </textpath>
 | 
				
			||||||
 | 
					        </text>
 | 
				
			||||||
 | 
					        <path
 | 
				
			||||||
 | 
					            class="displacy-arrowhead"
 | 
				
			||||||
 | 
					            d="M1090.0,266.5 L1098.0,254.5 1082.0,254.5"
 | 
				
			||||||
 | 
					            fill="currentColor"
 | 
				
			||||||
 | 
					        ></path>
 | 
				
			||||||
 | 
					    </g>
 | 
				
			||||||
 | 
					</svg>
 | 
				
			||||||
| 
		 After Width: | Height: | Size: 6.8 KiB  | 
| 
						 | 
					@ -1,84 +0,0 @@
 | 
				
			||||||
<div
 | 
					 | 
				
			||||||
    class="spans"
 | 
					 | 
				
			||||||
    style="
 | 
					 | 
				
			||||||
        line-height: 2.5;
 | 
					 | 
				
			||||||
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
 | 
					 | 
				
			||||||
            'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
 | 
					 | 
				
			||||||
        font-size: 18px;
 | 
					 | 
				
			||||||
        direction: ltr;
 | 
					 | 
				
			||||||
    "
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
    Welcome to the
 | 
					 | 
				
			||||||
    <span style="font-weight: bold; display: inline-block; position: relative">
 | 
					 | 
				
			||||||
        Bank
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                background: #ddd;
 | 
					 | 
				
			||||||
                top: 40px;
 | 
					 | 
				
			||||||
                height: 4px;
 | 
					 | 
				
			||||||
                left: -1px;
 | 
					 | 
				
			||||||
                width: calc(100% + 2px);
 | 
					 | 
				
			||||||
                position: absolute;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
        </span>
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                background: #ddd;
 | 
					 | 
				
			||||||
                top: 40px;
 | 
					 | 
				
			||||||
                height: 4px;
 | 
					 | 
				
			||||||
                border-top-left-radius: 3px;
 | 
					 | 
				
			||||||
                border-bottom-left-radius: 3px;
 | 
					 | 
				
			||||||
                left: -1px;
 | 
					 | 
				
			||||||
                width: calc(100% + 2px);
 | 
					 | 
				
			||||||
                position: absolute;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
            <span
 | 
					 | 
				
			||||||
                style="
 | 
					 | 
				
			||||||
                    background: #ddd;
 | 
					 | 
				
			||||||
                    color: #000;
 | 
					 | 
				
			||||||
                    top: -0.5em;
 | 
					 | 
				
			||||||
                    padding: 2px 3px;
 | 
					 | 
				
			||||||
                    position: absolute;
 | 
					 | 
				
			||||||
                    font-size: 0.6em;
 | 
					 | 
				
			||||||
                    font-weight: bold;
 | 
					 | 
				
			||||||
                    line-height: 1;
 | 
					 | 
				
			||||||
                    border-radius: 3px;
 | 
					 | 
				
			||||||
                "
 | 
					 | 
				
			||||||
            >
 | 
					 | 
				
			||||||
                BANK
 | 
					 | 
				
			||||||
            </span>
 | 
					 | 
				
			||||||
        </span>
 | 
					 | 
				
			||||||
    </span>
 | 
					 | 
				
			||||||
    <span style="font-weight: bold; display: inline-block; position: relative">
 | 
					 | 
				
			||||||
        of
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                background: #ddd;
 | 
					 | 
				
			||||||
                top: 40px;
 | 
					 | 
				
			||||||
                height: 4px;
 | 
					 | 
				
			||||||
                left: -1px;
 | 
					 | 
				
			||||||
                width: calc(100% + 2px);
 | 
					 | 
				
			||||||
                position: absolute;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
        </span>
 | 
					 | 
				
			||||||
    </span>
 | 
					 | 
				
			||||||
    <span style="font-weight: bold; display: inline-block; position: relative">
 | 
					 | 
				
			||||||
        China
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                background: #ddd;
 | 
					 | 
				
			||||||
                top: 40px;
 | 
					 | 
				
			||||||
                height: 4px;
 | 
					 | 
				
			||||||
                left: -1px;
 | 
					 | 
				
			||||||
                width: calc(100% + 2px);
 | 
					 | 
				
			||||||
                position: absolute;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
        </span>
 | 
					 | 
				
			||||||
    </span>
 | 
					 | 
				
			||||||
    .
 | 
					 | 
				
			||||||
</div>
 | 
					 | 
				
			||||||
| 
						 | 
					@ -1,123 +0,0 @@
 | 
				
			||||||
<div
 | 
					 | 
				
			||||||
    class="spans"
 | 
					 | 
				
			||||||
    style="
 | 
					 | 
				
			||||||
        line-height: 2.5;
 | 
					 | 
				
			||||||
        direction: ltr;
 | 
					 | 
				
			||||||
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
 | 
					 | 
				
			||||||
            'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
 | 
					 | 
				
			||||||
        font-size: 18px;
 | 
					 | 
				
			||||||
    "
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
    Welcome to the
 | 
					 | 
				
			||||||
    <span style="font-weight: bold; display: inline-block; position: relative">
 | 
					 | 
				
			||||||
        Bank
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                background: #7aecec;
 | 
					 | 
				
			||||||
                top: 40px;
 | 
					 | 
				
			||||||
                height: 4px;
 | 
					 | 
				
			||||||
                left: -1px;
 | 
					 | 
				
			||||||
                width: calc(100% + 2px);
 | 
					 | 
				
			||||||
                position: absolute;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
        </span>
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                background: #7aecec;
 | 
					 | 
				
			||||||
                top: 40px;
 | 
					 | 
				
			||||||
                height: 4px;
 | 
					 | 
				
			||||||
                border-top-left-radius: 3px;
 | 
					 | 
				
			||||||
                border-bottom-left-radius: 3px;
 | 
					 | 
				
			||||||
                left: -1px;
 | 
					 | 
				
			||||||
                width: calc(100% + 2px);
 | 
					 | 
				
			||||||
                position: absolute;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
            <span
 | 
					 | 
				
			||||||
                style="
 | 
					 | 
				
			||||||
                    background: #7aecec;
 | 
					 | 
				
			||||||
                    color: #000;
 | 
					 | 
				
			||||||
                    top: -0.5em;
 | 
					 | 
				
			||||||
                    padding: 2px 3px;
 | 
					 | 
				
			||||||
                    position: absolute;
 | 
					 | 
				
			||||||
                    font-size: 0.6em;
 | 
					 | 
				
			||||||
                    font-weight: bold;
 | 
					 | 
				
			||||||
                    line-height: 1;
 | 
					 | 
				
			||||||
                    border-radius: 3px;
 | 
					 | 
				
			||||||
                "
 | 
					 | 
				
			||||||
            >
 | 
					 | 
				
			||||||
                ORG
 | 
					 | 
				
			||||||
            </span>
 | 
					 | 
				
			||||||
        </span>
 | 
					 | 
				
			||||||
    </span>
 | 
					 | 
				
			||||||
    <span style="font-weight: bold; display: inline-block; position: relative">
 | 
					 | 
				
			||||||
        of
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                background: #7aecec;
 | 
					 | 
				
			||||||
                top: 40px;
 | 
					 | 
				
			||||||
                height: 4px;
 | 
					 | 
				
			||||||
                left: -1px;
 | 
					 | 
				
			||||||
                width: calc(100% + 2px);
 | 
					 | 
				
			||||||
                position: absolute;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
        </span>
 | 
					 | 
				
			||||||
    </span>
 | 
					 | 
				
			||||||
    <span style="font-weight: bold; display: inline-block; position: relative">
 | 
					 | 
				
			||||||
        China
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                background: #7aecec;
 | 
					 | 
				
			||||||
                top: 40px;
 | 
					 | 
				
			||||||
                height: 4px;
 | 
					 | 
				
			||||||
                left: -1px;
 | 
					 | 
				
			||||||
                width: calc(100% + 2px);
 | 
					 | 
				
			||||||
                position: absolute;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
        </span>
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                background: #feca74;
 | 
					 | 
				
			||||||
                top: 57px;
 | 
					 | 
				
			||||||
                height: 4px;
 | 
					 | 
				
			||||||
                left: -1px;
 | 
					 | 
				
			||||||
                width: calc(100% + 2px);
 | 
					 | 
				
			||||||
                position: absolute;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
        </span>
 | 
					 | 
				
			||||||
        <span
 | 
					 | 
				
			||||||
            style="
 | 
					 | 
				
			||||||
                background: #feca74;
 | 
					 | 
				
			||||||
                top: 57px;
 | 
					 | 
				
			||||||
                height: 4px;
 | 
					 | 
				
			||||||
                border-top-left-radius: 3px;
 | 
					 | 
				
			||||||
                border-bottom-left-radius: 3px;
 | 
					 | 
				
			||||||
                left: -1px;
 | 
					 | 
				
			||||||
                width: calc(100% + 2px);
 | 
					 | 
				
			||||||
                position: absolute;
 | 
					 | 
				
			||||||
            "
 | 
					 | 
				
			||||||
        >
 | 
					 | 
				
			||||||
            <span
 | 
					 | 
				
			||||||
                style="
 | 
					 | 
				
			||||||
                    background: #feca74;
 | 
					 | 
				
			||||||
                    color: #000;
 | 
					 | 
				
			||||||
                    top: -0.5em;
 | 
					 | 
				
			||||||
                    padding: 2px 3px;
 | 
					 | 
				
			||||||
                    position: absolute;
 | 
					 | 
				
			||||||
                    font-size: 0.6em;
 | 
					 | 
				
			||||||
                    font-weight: bold;
 | 
					 | 
				
			||||||
                    line-height: 1;
 | 
					 | 
				
			||||||
                    border-radius: 3px;
 | 
					 | 
				
			||||||
                "
 | 
					 | 
				
			||||||
            >
 | 
					 | 
				
			||||||
                GPE
 | 
					 | 
				
			||||||
            </span>
 | 
					 | 
				
			||||||
        </span>
 | 
					 | 
				
			||||||
    </span>
 | 
					 | 
				
			||||||
    .
 | 
					 | 
				
			||||||
</div>
 | 
					 | 
				
			||||||
| 
						 | 
					@ -107,6 +107,22 @@ const Image = ({ src, alt, title, href, ...props }) => {
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					const ImageScrollable = ({ src, alt, width, ...props }) => {
 | 
				
			||||||
 | 
					    return (
 | 
				
			||||||
 | 
					        <figure className={classNames(classes.standalone, classes.scrollable)}>
 | 
				
			||||||
 | 
					            <img className={classes['image-scrollable']} src={src} alt={alt} width={width} height="auto" />
 | 
				
			||||||
 | 
					        </figure>
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					const Standalone = ({ height, children, ...props }) => {
 | 
				
			||||||
 | 
					    return (
 | 
				
			||||||
 | 
					        <figure className={classes.standalone} style={{ height }}>
 | 
				
			||||||
 | 
					            {children}
 | 
				
			||||||
 | 
					        </figure>
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
const ImageFill = ({ image, ...props }) => {
 | 
					const ImageFill = ({ image, ...props }) => {
 | 
				
			||||||
    return (
 | 
					    return (
 | 
				
			||||||
        <span
 | 
					        <span
 | 
				
			||||||
| 
						 | 
					@ -137,4 +153,4 @@ const GoogleSheet = ({ id, link, height, button = 'View full table' }) => {
 | 
				
			||||||
    )
 | 
					    )
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
export { YouTube, SoundCloud, Iframe, Image, ImageFill, GoogleSheet }
 | 
					export { YouTube, SoundCloud, Iframe, Image, ImageFill, ImageScrollable, GoogleSheet, Standalone }
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -13,7 +13,7 @@ import Aside from './components/aside'
 | 
				
			||||||
import Button from './components/button'
 | 
					import Button from './components/button'
 | 
				
			||||||
import Tag from './components/tag'
 | 
					import Tag from './components/tag'
 | 
				
			||||||
import Grid from './components/grid'
 | 
					import Grid from './components/grid'
 | 
				
			||||||
import { YouTube, SoundCloud, Iframe, Image, GoogleSheet } from './components/embed'
 | 
					import { YouTube, SoundCloud, Iframe, Image, ImageScrollable, GoogleSheet, Standalone } from './components/embed'
 | 
				
			||||||
import Project from './widgets/project'
 | 
					import Project from './widgets/project'
 | 
				
			||||||
import { Integration, IntegrationLogo } from './widgets/integration.js'
 | 
					import { Integration, IntegrationLogo } from './widgets/integration.js'
 | 
				
			||||||
import { Logos, Colors, Patterns } from './widgets/styleguide'
 | 
					import { Logos, Colors, Patterns } from './widgets/styleguide'
 | 
				
			||||||
| 
						 | 
					@ -90,6 +90,8 @@ export const remarkComponents = {
 | 
				
			||||||
     * For regular img elements it is not possible to pass properties
 | 
					     * For regular img elements it is not possible to pass properties
 | 
				
			||||||
     */
 | 
					     */
 | 
				
			||||||
    Image,
 | 
					    Image,
 | 
				
			||||||
 | 
					    ImageScrollable,
 | 
				
			||||||
 | 
					    Standalone,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Label,
 | 
					    Label,
 | 
				
			||||||
    Logos,
 | 
					    Logos,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||