Merge pull request #13286 from explosion/master
Sync `docs/llm_main` with `master`

.github/FUNDING.yml (new file, vendored, 1 addition)
@@ -0,0 +1 @@
+custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions]

.github/workflows/tests.yml (vendored, 4 changes)
@@ -58,7 +58,7 @@ jobs:
       fail-fast: true
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python_version: ["3.11", "3.12.0-rc.2"]
+        python_version: ["3.12"]
         include:
           - os: windows-latest
             python_version: "3.7"
@@ -68,6 +68,8 @@ jobs:
             python_version: "3.9"
           - os: windows-latest
             python_version: "3.10"
+          - os: macos-latest
+            python_version: "3.11"

     runs-on: ${{ matrix.os }}

LICENSE (2 changes)
@@ -1,6 +1,6 @@
 The MIT License (MIT)

-Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

README.md (11 changes)
@@ -39,28 +39,35 @@ open-source software, released under the
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
 | ⏩ **[GPU Processing]** | Use spaCy with CUDA-compatible GPU processing. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
+| 🦙 **[Large Language Models]** | Integrate LLMs into spaCy pipelines. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
+| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
+| 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
-| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
-| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
+| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
+| <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy’s core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and we'll be in touch! **[Learn more →](https://explosion.ai/tailored-solutions)** |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
 [usage guides]: https://spacy.io/usage/
 [api reference]: https://spacy.io/api/
 [gpu processing]: https://spacy.io/usage#gpu
 [models]: https://spacy.io/models
+[large language models]: https://spacy.io/usage/large-language-models
 [universe]: https://spacy.io/universe
+[spacy vs code extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
+[blog]: https://explosion.ai
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
+[swag]: https://explosion.ai/merch

 ## 💬 Where to ask questions

licenses/3rd_party_licenses.txt
@@ -158,3 +158,45 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+
+SciPy
+-----
+
+* Files: scorer.py
+
+The implementation of trapezoid() is adapted from SciPy, which is distributed
+under the following license:
+
+New BSD License
+
+Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.1.8,<8.3.0",
+    "thinc>=8.2.2,<8.3.0",
     "numpy>=1.15.0; python_version < '3.9'",
     "numpy>=1.25.0; python_version >= '3.9'",
 ]

requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.1.8,<8.3.0
+thinc>=8.2.2,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0

setup.cfg
@@ -41,7 +41,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.1.8,<8.3.0
+    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
@@ -49,7 +49,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.1.8,<8.3.0
+    thinc>=8.2.2,<8.3.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0

spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.1"
+__version__ = "3.7.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

spacy/cli/__init__.py
@@ -22,8 +22,17 @@ from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .package import package  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .profile import profile  # noqa: F401
-from .train import train_cli  # noqa: F401
-from .validate import validate  # noqa: F401
+from .project.assets import project_assets  # type: ignore[attr-defined]  # noqa: F401
+from .project.clone import project_clone  # type: ignore[attr-defined]  # noqa: F401
+from .project.document import (  # type: ignore[attr-defined]  # noqa: F401
+    project_document,
+)
+from .project.dvc import project_update_dvc  # type: ignore[attr-defined]  # noqa: F401
+from .project.pull import project_pull  # type: ignore[attr-defined]  # noqa: F401
+from .project.push import project_push  # type: ignore[attr-defined]  # noqa: F401
+from .project.run import project_run  # type: ignore[attr-defined]  # noqa: F401
+from .train import train_cli  # type: ignore[attr-defined]  # noqa: F401
+from .validate import validate  # type: ignore[attr-defined]  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

spacy/cli/benchmark_speed.py
@@ -13,7 +13,7 @@ from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, setup_gpu
+from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu


 @benchmark_cli.command(
@@ -30,12 +30,14 @@ def benchmark_speed_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
     warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     # fmt: on
 ):
     """
     Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
     data in the binary .spacy format.
     """
+    import_code(code_path)
     setup_gpu(use_gpu=use_gpu, silent=False)

     nlp = util.load_model(model)
@@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
 def warmup(
     nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = warmup_epochs * docs
+    docs = [doc.copy() for doc in docs * warmup_epochs]
     return annotate(nlp, docs, batch_size)
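
The new `--code` option mirrors the flag of the same name on other spaCy commands: the file is imported before the pipeline is loaded, so any functions it registers become available to `util.load_model`. A hedged sketch of the underlying call (the path is illustrative, not from this PR):

```python
from pathlib import Path

# import_code() is the shared spacy.cli._util helper the command now calls;
# it imports the given file purely for its registration side effects.
from spacy.cli._util import import_code

import_code(Path("./functions.py"))  # hypothetical file with @registry functions
```
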
spacy/cli/download.py
@@ -7,7 +7,14 @@ from wasabi import msg

 from .. import about
 from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ..util import (
+    get_minor_version,
+    is_in_interactive,
+    is_in_jupyter,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app


@@ -77,6 +84,27 @@ def download(
         "Download and installation successful",
         f"You can now load the package via spacy.load('{model_name}')",
     )
+    if is_in_jupyter():
+        reload_deps_msg = (
+            "If you are in a Jupyter or Colab notebook, you may need to "
+            "restart Python in order to load all the package's dependencies. "
+            "You can do this by selecting the 'Restart kernel' or 'Restart "
+            "runtime' option."
+        )
+        msg.warn(
+            "Restart to reload dependencies",
+            reload_deps_msg,
+        )
+    elif is_in_interactive():
+        reload_deps_msg = (
+            "If you are in an interactive Python session, you may need to "
+            "exit and restart Python to load all the package's dependencies. "
+            "You can exit with Ctrl-D (or Ctrl-Z and Enter on Windows)."
+        )
+        msg.warn(
+            "Restart to reload dependencies",
+            reload_deps_msg,
+        )


 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
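
The helpers `is_in_jupyter` and `is_in_interactive` used above live in `spacy.util`. As a hedged sketch of how this kind of detection is commonly implemented (an assumption about the internals, not a copy of them): CPython defines `sys.ps1`/`sys.ps2` only at an interactive prompt, and IPython/Jupyter kernels inject `get_ipython` into builtins.

```python
import sys


def is_in_interactive() -> bool:
    # sys.ps1/ps2 exist only when Python is running an interactive prompt
    return hasattr(sys, "ps1") or hasattr(sys, "ps2")


def is_in_jupyter() -> bool:
    try:
        # get_ipython is injected into builtins by IPython/Jupyter kernels
        shell = get_ipython().__class__.__name__  # type: ignore[name-defined]
        return shell == "ZMQInteractiveShell"
    except NameError:
        return False
```
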
spacy/cli/package.py
@@ -1,5 +1,7 @@
+import os
 import re
 import shutil
+import subprocess
 import sys
 from collections import defaultdict
 from pathlib import Path
@@ -11,6 +13,7 @@ from thinc.api import Config
 from wasabi import MarkdownRenderer, Printer, get_raw_input

 from .. import about, util
+from ..compat import importlib_metadata
 from ..schemas import ModelMetaSchema, validate
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@@ -35,7 +38,7 @@ def package_cli(
     specified output directory, and the data will be copied over. If
     --create-meta is set and a meta.json already exists in the output directory,
     the existing values will be used as the defaults in the command-line prompt.
-    After packaging, "python setup.py sdist" is run in the package directory,
+    After packaging, "python -m build --sdist" is run in the package directory,
     which will create a .tar.gz archive that can be installed via "pip install".

     If additional code files are provided (e.g. Python files containing custom
@@ -78,9 +81,17 @@ def package(
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)
-    if create_wheel and not has_wheel():
-        err = "Generating a binary .whl file requires wheel to be installed"
-        msg.fail(err, "pip install wheel", exits=1)
+    if create_wheel and not has_wheel() and not has_build():
+        err = (
+            "Generating wheels requires 'build' or 'wheel' (deprecated) to be installed"
+        )
+        msg.fail(err, "pip install build", exits=1)
+    if not has_build():
+        msg.warn(
+            "Generating packages without the 'build' package is deprecated and "
+            "will not be supported in the future. To install 'build': pip "
+            "install build"
+        )
     if not input_path or not input_path.exists():
         msg.fail("Can't locate pipeline data", input_path, exits=1)
     if not output_path or not output_path.exists():
@@ -184,12 +195,37 @@ def package(
     msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
+            # run directly, since util.run_command is not designed to continue
+            # after a command fails
+            ret = subprocess.run(
+                [sys.executable, "-m", "build", ".", "--sdist"],
+                env=os.environ.copy(),
+            )
+            if ret.returncode != 0:
+                msg.warn(
+                    "Creating sdist with 'python -m build' failed. Falling "
+                    "back to deprecated use of 'python setup.py sdist'"
+                )
+                util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
         zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
         msg.good(f"Successfully created zipped Python package", zip_file)
     if create_wheel:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
+            # run directly, since util.run_command is not designed to continue
+            # after a command fails
+            ret = subprocess.run(
+                [sys.executable, "-m", "build", ".", "--wheel"],
+                env=os.environ.copy(),
+            )
+            if ret.returncode != 0:
+                msg.warn(
+                    "Creating wheel with 'python -m build' failed. Falling "
+                    "back to deprecated use of 'wheel' with "
+                    "'python setup.py bdist_wheel'"
+                )
+                util.run_command(
+                    [sys.executable, "setup.py", "bdist_wheel"], capture=False
+                )
         wheel_name_squashed = re.sub("_+", "_", model_name_v)
         wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
         msg.good(f"Successfully created binary wheel", wheel)
@@ -209,6 +245,17 @@ def has_wheel() -> bool:
         return False


+def has_build() -> bool:
+    # it's very likely that there is a local directory named build/ (especially
+    # in an editable install), so an import check is not sufficient; instead
+    # check that there is a package version
+    try:
+        importlib_metadata.version("build")
+        return True
+    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
+        return False
+
+
 def get_third_party_dependencies(
     config: Config, exclude: List[str] = util.SimpleFrozenList()
 ) -> List[str]:
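
The substantive change here swaps the deprecated `setup.py sdist`/`bdist_wheel` calls for the PEP 517 `build` frontend, with a warn-and-fall-back path when `build` is missing. For reference, a hedged sketch of the equivalent standalone invocation (assuming the `build` package is installed and the current directory contains the generated package):

```python
import subprocess
import sys

# Produce both an sdist and a wheel via the PEP 517 frontend, mirroring what
# `spacy package` now shells out to.
ret = subprocess.run([sys.executable, "-m", "build", ".", "--sdist", "--wheel"])
if ret.returncode != 0:
    raise SystemExit("'python -m build' failed; is the 'build' package installed?")
```
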
spacy/cli/project/__init__.py (new file, empty)

spacy/cli/project/assets.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.assets import *

spacy/cli/project/clone.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.clone import *

spacy/cli/project/document.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.document import *

spacy/cli/project/dvc.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.dvc import *

spacy/cli/project/pull.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.pull import *

spacy/cli/project/push.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.push import *

spacy/cli/project/remote_storage.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.remote_storage import *

spacy/cli/project/run.py (new file, 1 addition)
@@ -0,0 +1 @@
+from weasel.cli.run import *

spacy/cli/templates/quickstart_training.jinja
@@ -271,8 +271,9 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"

 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false

@@ -308,8 +309,9 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"

 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false

@@ -542,14 +544,15 @@ nO = null
 width = ${components.tok2vec.model.encode.width}

 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false

 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
@@ -570,15 +573,17 @@ nO = null
 width = ${components.tok2vec.model.encode.width}

 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false

 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false
 {%- endif %}
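
The bump from `TextCatBOW.v2` to `v3` exposes the previously hard-coded `length` hyperparameter, the size of the hashed ngram table; values below 1 now raise `E1056` (see the errors.py hunk further down). A hedged sketch of loading such a config programmatically (labels and training setup elided; 262144 mirrors the template default above):

```python
import spacy  # noqa: F401  # registers built-in architectures
from spacy.util import load_config_from_str, load_model_from_config

CFG = """
[nlp]
lang = "en"
pipeline = ["textcat"]

[components.textcat]
factory = "textcat"

[components.textcat.model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
length = 262144
"""

# auto_fill completes the remaining defaults before the pipeline is built
nlp = load_model_from_config(load_config_from_str(CFG), auto_fill=True)
```
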
spacy/displacy/render.py
@@ -142,7 +142,25 @@ class SpanRenderer:
         spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
         title (str / None): Document title set in Doc.user_data['title'].
         """
-        per_token_info = []
+        per_token_info = self._assemble_per_token_info(tokens, spans)
+        markup = self._render_markup(per_token_info)
+        markup = TPL_SPANS.format(content=markup, dir=self.direction)
+        if title:
+            markup = TPL_TITLE.format(title=title) + markup
+        return markup
+
+    @staticmethod
+    def _assemble_per_token_info(
+        tokens: List[str], spans: List[Dict[str, Any]]
+    ) -> List[Dict[str, List[Dict[str, Any]]]]:
+        """Assembles token info used to generate markup in render_spans().
+        tokens (List[str]): Tokens in text.
+        spans (List[Dict[str, Any]]): Spans in text.
+        RETURNS (List[Dict[str, List[Dict, str, Any]]]): Per token info needed to render HTML markup for given tokens
+            and spans.
+        """
+        per_token_info: List[Dict[str, List[Dict[str, Any]]]] = []
+
         # we must sort so that we can correctly describe when spans need to "stack"
         # which is determined by their start token, then span length (longer spans on top),
         # then break any remaining ties with the span label
@@ -154,21 +172,22 @@ class SpanRenderer:
                 s["label"],
             ),
         )
+
         for s in spans:
             # this is the vertical 'slot' that the span will be rendered in
             # vertical_position = span_label_offset + (offset_step * (slot - 1))
             s["render_slot"] = 0
+
         for idx, token in enumerate(tokens):
             # Identify if a token belongs to a Span (and which) and if it's a
             # start token of said Span. We'll use this for the final HTML render
             token_markup: Dict[str, Any] = {}
             token_markup["text"] = token
-            concurrent_spans = 0
+            intersecting_spans: List[Dict[str, Any]] = []
             entities = []
             for span in spans:
                 ent = {}
                 if span["start_token"] <= idx < span["end_token"]:
-                    concurrent_spans += 1
                     span_start = idx == span["start_token"]
                     ent["label"] = span["label"]
                     ent["is_start"] = span_start
@@ -176,7 +195,12 @@ class SpanRenderer:
                         # When the span starts, we need to know how many other
                         # spans are on the 'span stack' and will be rendered.
                         # This value becomes the vertical render slot for this entire span
-                        span["render_slot"] = concurrent_spans
+                        span["render_slot"] = (
+                            intersecting_spans[-1]["render_slot"]
+                            if len(intersecting_spans)
+                            else 0
+                        ) + 1
+                    intersecting_spans.append(span)
                     ent["render_slot"] = span["render_slot"]
                     kb_id = span.get("kb_id", "")
                     kb_url = span.get("kb_url", "#")
@@ -193,11 +217,8 @@ class SpanRenderer:
                     span["render_slot"] = 0
             token_markup["entities"] = entities
             per_token_info.append(token_markup)
-        markup = self._render_markup(per_token_info)
-        markup = TPL_SPANS.format(content=markup, dir=self.direction)
-        if title:
-            markup = TPL_TITLE.format(title=title) + markup
-        return markup
+
+        return per_token_info

     def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
         """Render the markup from per-token information"""
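
For context, the slot-assignment logic above drives the span visualizer. A usage sketch adapted from the spaCy docs, where two overlapping spans land in different render slots:

```python
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China")
# "Bank of China" (ORG) and "China" (GPE) overlap, so they stack vertically
doc.spans["sc"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
html = displacy.render(doc, style="span")
```
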
spacy/errors.py
@@ -227,7 +227,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
             "This usually happens when spaCy calls `nlp.{method}` with a custom "
             "component name that's not registered on the current language class. "
-            "If you're using a Transformer, make sure to install 'spacy-transformers'. "
             "If you're using a custom component, make sure you've added the "
             "decorator `@Language.component` (for function components) or "
             "`@Language.factory` (for class components).\n\nAvailable "
@@ -984,6 +983,10 @@ class Errors(metaclass=ErrorsWithCodes):
              "predicted docs when training {component}.")
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
+    E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+             "reduction. Please enable one of `use_reduce_first`, "
+             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")


 # Deprecated model shortcuts, only used in errors and warnings

spacy/kb/__init__.py
@@ -1,3 +1,11 @@
 from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
+
+__all__ = [
+    "Candidate",
+    "KnowledgeBase",
+    "InMemoryLookupKB",
+    "get_candidates",
+    "get_candidates_batch",
+]

spacy/lang/en/lex_attrs.py
@@ -6,7 +6,8 @@ _num_words = [
     "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
     "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
     "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
-    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
+    "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion",
+    "septillion", "octillion", "nonillion", "decillion", "gajillion", "bazillion"
 ]
 _ordinal_words = [
     "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
@@ -14,7 +15,8 @@ _ordinal_words = [
     "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
     "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
     "eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth",
-    "trillionth", "quadrillionth", "gajillionth", "bazillionth"
+    "trillionth", "quadrillionth", "quintillionth", "sextillionth", "septillionth",
+    "octillionth", "nonillionth", "decillionth", "gajillionth", "bazillionth"
 ]
 # fmt: on
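
These lists back the English `like_num` lexical attribute, so the added scale words are now recognized as number-like. A quick sketch to verify:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("nine quintillion")
# like_num checks the lowercased text against _num_words, among other rules
print([token.like_num for token in doc])  # expected: [True, True]
```
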
spacy/lang/fo/__init__.py (new file, 18 additions)
@@ -0,0 +1,18 @@
+from ...language import BaseDefaults, Language
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+
+
+class FaroeseDefaults(BaseDefaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    prefixes = TOKENIZER_PREFIXES
+
+
+class Faroese(Language):
+    lang = "fo"
+    Defaults = FaroeseDefaults
+
+
+__all__ = ["Faroese"]

spacy/lang/fo/tokenizer_exceptions.py (new file, 90 additions)
@@ -0,0 +1,90 @@
+from ...symbols import ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+
+_exc = {}
+
+for orth in [
+    "apr.",
+    "aug.",
+    "avgr.",
+    "árg.",
+    "ávís.",
+    "beinl.",
+    "blkv.",
+    "blaðkv.",
+    "blm.",
+    "blaðm.",
+    "bls.",
+    "blstj.",
+    "blaðstj.",
+    "des.",
+    "eint.",
+    "febr.",
+    "fyrrv.",
+    "góðk.",
+    "h.m.",
+    "innt.",
+    "jan.",
+    "kl.",
+    "m.a.",
+    "mðr.",
+    "mió.",
+    "nr.",
+    "nto.",
+    "nov.",
+    "nút.",
+    "o.a.",
+    "o.a.m.",
+    "o.a.tíl.",
+    "o.fl.",
+    "ff.",
+    "o.m.a.",
+    "o.o.",
+    "o.s.fr.",
+    "o.tíl.",
+    "o.ø.",
+    "okt.",
+    "omf.",
+    "pst.",
+    "ritstj.",
+    "sbr.",
+    "sms.",
+    "smst.",
+    "smb.",
+    "sb.",
+    "sbrt.",
+    "sp.",
+    "sept.",
+    "spf.",
+    "spsk.",
+    "t.e.",
+    "t.s.",
+    "t.s.s.",
+    "tlf.",
+    "tel.",
+    "tsk.",
+    "t.o.v.",
+    "t.d.",
+    "uml.",
+    "ums.",
+    "uppl.",
+    "upprfr.",
+    "uppr.",
+    "útg.",
+    "útl.",
+    "útr.",
+    "vanl.",
+    "v.",
+    "v.h.",
+    "v.ø.o.",
+    "viðm.",
+    "viðv.",
+    "vm.",
+    "v.m.",
+]:
+    _exc[orth] = [{ORTH: orth}]
+    capitalized = orth.capitalize()
+    _exc[capitalized] = [{ORTH: capitalized}]
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

spacy/lang/nn/__init__.py (new file, 20 additions)
@@ -0,0 +1,20 @@
+from ...language import BaseDefaults, Language
+from ..nb import SYNTAX_ITERATORS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+
+
+class NorwegianNynorskDefaults(BaseDefaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    prefixes = TOKENIZER_PREFIXES
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    syntax_iterators = SYNTAX_ITERATORS
+
+
+class NorwegianNynorsk(Language):
+    lang = "nn"
+    Defaults = NorwegianNynorskDefaults
+
+
+__all__ = ["NorwegianNynorsk"]

spacy/lang/nn/examples.py (new file, 15 additions)
@@ -0,0 +1,15 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.nn.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+# sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
+sentences = [
+    "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
+    "Det er ein meir enn i same periode i fjor.",
+    "Det har lava ned enorme snømengder i store delar av Europa den siste tida.",
+    "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.",
+]

spacy/lang/nn/punctuation.py (new file, 74 additions)
@@ -0,0 +1,74 @@
+from ..char_classes import (
+    ALPHA,
+    ALPHA_LOWER,
+    ALPHA_UPPER,
+    CONCAT_QUOTES,
+    CURRENCY,
+    LIST_CURRENCY,
+    LIST_ELLIPSES,
+    LIST_ICONS,
+    LIST_PUNCT,
+    LIST_QUOTES,
+    PUNCT,
+    UNITS,
+)
+from ..punctuation import TOKENIZER_SUFFIXES
+
+_quotes = CONCAT_QUOTES.replace("'", "")
+_list_punct = [x for x in LIST_PUNCT if x != "#"]
+_list_icons = [x for x in LIST_ICONS if x != "°"]
+_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
+_list_quotes = [x for x in LIST_QUOTES if x != "\\'"]
+
+
+_prefixes = (
+    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
+    + _list_punct
+    + LIST_ELLIPSES
+    + LIST_QUOTES
+    + LIST_CURRENCY
+    + LIST_ICONS
+)
+
+
+_infixes = (
+    LIST_ELLIPSES
+    + _list_icons
+    + [
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+_suffixes = (
+    LIST_PUNCT
+    + LIST_ELLIPSES
+    + _list_quotes
+    + _list_icons
+    + ["—", "–"]
+    + [
+        r"(?<=[0-9])\+",
+        r"(?<=°[FfCcKk])\.",
+        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
+        ),
+        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+    ]
+    + [r"(?<=[^sSxXzZ])'"]
+)
+_suffixes += [
+    suffix
+    for suffix in TOKENIZER_SUFFIXES
+    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
+]
+
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes

spacy/lang/nn/tokenizer_exceptions.py (new file, 228 additions)
@@ -0,0 +1,228 @@
+from ...symbols import NORM, ORTH
+from ...util import update_exc
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+
+_exc = {}
+
+
+for exc_data in [
+    {ORTH: "jan.", NORM: "januar"},
+    {ORTH: "feb.", NORM: "februar"},
+    {ORTH: "mar.", NORM: "mars"},
+    {ORTH: "apr.", NORM: "april"},
+    {ORTH: "jun.", NORM: "juni"},
+    # note: "jul." is in the simple list below without a NORM exception
+    {ORTH: "aug.", NORM: "august"},
+    {ORTH: "sep.", NORM: "september"},
+    {ORTH: "okt.", NORM: "oktober"},
+    {ORTH: "nov.", NORM: "november"},
+    {ORTH: "des.", NORM: "desember"},
+]:
+    _exc[exc_data[ORTH]] = [exc_data]
+
+
+for orth in [
+    "Ap.",
+    "Aq.",
+    "Ca.",
+    "Chr.",
+    "Co.",
+    "Dr.",
+    "F.eks.",
+    "Fr.p.",
+    "Frp.",
+    "Grl.",
+    "Kr.",
+    "Kr.F.",
+    "Kr.F.s",
+    "Mr.",
+    "Mrs.",
+    "Pb.",
+    "Pr.",
+    "Sp.",
+    "St.",
+    "a.m.",
+    "ad.",
+    "adm.dir.",
+    "adr.",
+    "b.c.",
+    "bl.a.",
+    "bla.",
+    "bm.",
+    "bnr.",
+    "bto.",
+    "c.c.",
+    "ca.",
+    "cand.mag.",
+    "co.",
+    "d.d.",
+    "d.m.",
+    "d.y.",
+    "dept.",
+    "dr.",
+    "dr.med.",
+    "dr.philos.",
+    "dr.psychol.",
+    "dss.",
+    "dvs.",
+    "e.Kr.",
+    "e.l.",
+    "eg.",
+    "eig.",
+    "ekskl.",
+    "el.",
+    "et.",
+    "etc.",
+    "etg.",
+    "ev.",
+    "evt.",
+    "f.",
+    "f.Kr.",
+    "f.eks.",
+    "f.o.m.",
+    "fhv.",
+    "fk.",
+    "foreg.",
+    "fork.",
+    "fv.",
+    "fvt.",
+    "g.",
+    "gl.",
+    "gno.",
+    "gnr.",
+    "grl.",
+    "gt.",
+    "h.r.adv.",
+    "hhv.",
+    "hoh.",
+    "hr.",
+    "ifb.",
+    "ifm.",
+    "iht.",
+    "inkl.",
+    "istf.",
+    "jf.",
+    "jr.",
+    "jul.",
+    "juris.",
+    "kfr.",
+    "kgl.",
+    "kgl.res.",
+    "kl.",
+    "komm.",
+    "kr.",
+    "kst.",
+    "lat.",
+    "lø.",
+    "m.a.",
+    "m.a.o.",
+    "m.fl.",
+    "m.m.",
+    "m.v.",
+    "ma.",
+    "mag.art.",
+    "md.",
+    "mfl.",
+    "mht.",
+    "mill.",
+    "min.",
+    "mnd.",
+    "moh.",
+    "mrd.",
+    "muh.",
+    "mv.",
+    "mva.",
+    "n.å.",
+    "ndf.",
+    "nr.",
+    "nto.",
+    "nyno.",
+    "o.a.",
+    "o.l.",
+    "obl.",
+    "off.",
+    "ofl.",
+    "on.",
+    "op.",
+    "org.",
+    "osv.",
+    "ovf.",
+    "p.",
+    "p.a.",
+    "p.g.a.",
+    "p.m.",
+    "p.t.",
+    "pga.",
+    "ph.d.",
+    "pkt.",
+    "pr.",
+    "pst.",
+    "pt.",
+    "red.anm.",
+    "ref.",
+    "res.",
+    "res.kap.",
+    "resp.",
+    "rv.",
+    "s.",
+    "s.d.",
+    "s.k.",
+    "s.u.",
+    "s.å.",
+    "sen.",
+    "sep.",
+    "siviling.",
+    "sms.",
+    "snr.",
+    "spm.",
+    "sr.",
+    "sst.",
+    "st.",
+    "st.meld.",
+    "st.prp.",
+    "stip.",
+    "stk.",
+    "stud.",
+    "sv.",
+    "såk.",
+    "sø.",
+    "t.d.",
+    "t.h.",
+    "t.o.m.",
+    "t.v.",
+    "temp.",
+    "ti.",
+    "tils.",
+    "tilsv.",
+    "tl;dr",
+    "tlf.",
+    "to.",
+    "ult.",
+    "utg.",
+    "v.",
+    "vedk.",
+    "vedr.",
+    "vg.",
+    "vgs.",
+    "vha.",
+    "vit.ass.",
+    "vn.",
+    "vol.",
+    "vs.",
+    "vsa.",
+    "§§",
+    "©NTB",
+    "årg.",
+    "årh.",
+]:
+    _exc[orth] = [{ORTH: orth}]
+
+# Dates
+for h in range(1, 31 + 1):
+    for period in ["."]:
+        _exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
+
+_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
+_exc.update(_custom_base_exc)
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
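
With these modules in place, `fo` (Faroese) and `nn` (Norwegian Nynorsk) can be used like any other blank pipeline. A hedged sketch (the sample sentence is illustrative, not from the repo):

```python
import spacy

# Both new languages ship tokenizer exceptions, so abbreviations such as the
# Nynorsk "t.d." ("for example") stay single tokens.
for lang in ("fo", "nn"):
    nlp = spacy.blank(lang)

nlp = spacy.blank("nn")
doc = nlp("Det er t.d. mogleg å lage ein tom pipeline.")
print([t.text for t in doc])
```
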
|  | @ -1683,6 +1683,12 @@ class Language: | |||
|         for proc in procs: | ||||
|             proc.start() | ||||
| 
 | ||||
|         # Close writing-end of channels. This is needed to avoid that reading | ||||
|         # from the channel blocks indefinitely when the worker closes the | ||||
|         # channel. | ||||
|         for tx in bytedocs_send_ch: | ||||
|             tx.close() | ||||
| 
 | ||||
|         # Cycle channels not to break the order of docs. | ||||
|         # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable. | ||||
|         byte_tuples = chain.from_iterable( | ||||
|  | @ -1705,8 +1711,23 @@ class Language: | |||
|                     # tell `sender` that one batch was consumed. | ||||
|                     sender.step() | ||||
|         finally: | ||||
|             # If we are stopping in an orderly fashion, the workers' queues | ||||
|             # are empty. Put the sentinel in their queues to signal that work | ||||
|             # is done, so that they can exit gracefully. | ||||
|             for q in texts_q: | ||||
|                 q.put(_WORK_DONE_SENTINEL) | ||||
| 
 | ||||
|             # Otherwise, we are stopping because the error handler raised an | ||||
|             # exception. The sentinel will be last to go out of the queue. | ||||
|             # To avoid doing unnecessary work or hanging on platforms that | ||||
|             # block on sending (Windows), we'll close our end of the channel. | ||||
|             # This signals to the worker that it can exit the next time it | ||||
|             # attempts to send data down the channel. | ||||
|             for r in bytedocs_recv_ch: | ||||
|                 r.close() | ||||
| 
 | ||||
|             for proc in procs: | ||||
|                 proc.terminate() | ||||
|                 proc.join() | ||||
| 
 | ||||
|     def _link_components(self) -> None: | ||||
|         """Register 'listeners' within pipeline components, to allow them to | ||||
|  | @ -2323,6 +2344,11 @@ def _apply_pipes( | |||
|     while True: | ||||
|         try: | ||||
|             texts_with_ctx = receiver.get() | ||||
| 
 | ||||
|             # Stop working if we encounter the end-of-work sentinel. | ||||
|             if isinstance(texts_with_ctx, _WorkDoneSentinel): | ||||
|                 return | ||||
| 
 | ||||
|             docs = ( | ||||
|                 ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx | ||||
|             ) | ||||
|  | @ -2331,11 +2357,21 @@ def _apply_pipes( | |||
|             # Connection does not accept unpickable objects, so send list. | ||||
|             byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs] | ||||
|             padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs)) | ||||
|             sender.send(byte_docs + padding)  # type: ignore[operator] | ||||
|             data: Sequence[Tuple[Optional[bytes], Optional[Any], Optional[bytes]]] = ( | ||||
|                 byte_docs + padding  # type: ignore[operator] | ||||
|             ) | ||||
|         except Exception: | ||||
|             error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))] | ||||
|             padding = [(None, None, None)] * (len(texts_with_ctx) - 1) | ||||
|             sender.send(error_msg + padding) | ||||
|             data = error_msg + padding | ||||
| 
 | ||||
|         try: | ||||
|             sender.send(data) | ||||
|         except BrokenPipeError: | ||||
|             # Parent has closed the pipe prematurely. This happens when a | ||||
|             # worker encounters an error and the error handler is set to | ||||
|             # stop processing. | ||||
|             return | ||||
| 
 | ||||
| 
 | ||||
| class _Sender: | ||||
|  | @ -2365,3 +2401,10 @@ class _Sender: | |||
|         if self.count >= self.chunk_size: | ||||
|             self.count = 0 | ||||
|             self.send() | ||||
| 
 | ||||
| 
 | ||||
| class _WorkDoneSentinel: | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| _WORK_DONE_SENTINEL = _WorkDoneSentinel() | ||||
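
The shutdown protocol above combines two signals: an in-band sentinel for the orderly path and a closed receive channel for the error path. A self-contained sketch of the same pattern, with hypothetical names rather than spaCy's API:

```python
# Minimal sketch of the sentinel + closed-channel shutdown pattern
# (hypothetical names, not spaCy's API).
from multiprocessing import Pipe, Process, Queue


class WorkDone:
    pass


WORK_DONE = WorkDone()


def worker(inbox, conn):
    while True:
        item = inbox.get()
        if isinstance(item, WorkDone):
            return  # orderly path: the queue was drained, then the sentinel
        try:
            conn.send(item * 2)
        except BrokenPipeError:
            return  # error path: the parent closed its end of the channel


if __name__ == "__main__":
    inbox = Queue()
    parent_conn, child_conn = Pipe()
    proc = Process(target=worker, args=(inbox, child_conn))
    proc.start()
    inbox.put(21)
    print(parent_conn.recv())  # 42
    inbox.put(WORK_DONE)
    proc.join()
```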
|  |  | |||
|  | @ -3,4 +3,4 @@ from .levenshtein import levenshtein | |||
| from .matcher import Matcher | ||||
| from .phrasematcher import PhraseMatcher | ||||
| 
 | ||||
| __all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"] | ||||
| __all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"] | ||||
|  |  | |||
|  | @ -1,21 +1,27 @@ | |||
| from functools import partial | ||||
| from typing import List, Optional, cast | ||||
| from typing import List, Optional, Tuple, cast | ||||
| 
 | ||||
| from thinc.api import ( | ||||
|     Dropout, | ||||
|     Gelu, | ||||
|     LayerNorm, | ||||
|     Linear, | ||||
|     Logistic, | ||||
|     Maxout, | ||||
|     Model, | ||||
|     ParametricAttention, | ||||
|     ParametricAttention_v2, | ||||
|     Relu, | ||||
|     Softmax, | ||||
|     SparseLinear, | ||||
|     SparseLinear_v2, | ||||
|     chain, | ||||
|     clone, | ||||
|     concatenate, | ||||
|     list2ragged, | ||||
|     reduce_first, | ||||
|     reduce_last, | ||||
|     reduce_max, | ||||
|     reduce_mean, | ||||
|     reduce_sum, | ||||
|     residual, | ||||
|  | @ -25,9 +31,10 @@ from thinc.api import ( | |||
| ) | ||||
| from thinc.layers.chain import init as init_chain | ||||
| from thinc.layers.resizable import resize_linear_weighted, resize_model | ||||
| from thinc.types import Floats2d | ||||
| from thinc.types import ArrayXd, Floats2d | ||||
| 
 | ||||
| from ...attrs import ORTH | ||||
| from ...errors import Errors | ||||
| from ...tokens import Doc | ||||
| from ...util import registry | ||||
| from ..extract_ngrams import extract_ngrams | ||||
|  | @ -47,39 +54,15 @@ def build_simple_cnn_text_classifier( | |||
|     outputs sum to 1. If exclusive_classes=False, a logistic non-linearity | ||||
|     is applied instead, so that outputs are in the range [0, 1]. | ||||
|     """ | ||||
|     fill_defaults = {"b": 0, "W": 0} | ||||
|     with Model.define_operators({">>": chain}): | ||||
|         cnn = tok2vec >> list2ragged() >> reduce_mean() | ||||
|         nI = tok2vec.maybe_get_dim("nO") | ||||
|         if exclusive_classes: | ||||
|             output_layer = Softmax(nO=nO, nI=nI) | ||||
|             fill_defaults["b"] = NEG_VALUE | ||||
|             resizable_layer: Model = resizable( | ||||
|                 output_layer, | ||||
|                 resize_layer=partial( | ||||
|                     resize_linear_weighted, fill_defaults=fill_defaults | ||||
|                 ), | ||||
|             ) | ||||
|             model = cnn >> resizable_layer | ||||
|         else: | ||||
|             output_layer = Linear(nO=nO, nI=nI) | ||||
|             resizable_layer = resizable( | ||||
|                 output_layer, | ||||
|                 resize_layer=partial( | ||||
|                     resize_linear_weighted, fill_defaults=fill_defaults | ||||
|                 ), | ||||
|             ) | ||||
|             model = cnn >> resizable_layer >> Logistic() | ||||
|         model.set_ref("output_layer", output_layer) | ||||
|         model.attrs["resize_output"] = partial( | ||||
|             resize_and_set_ref, | ||||
|             resizable_layer=resizable_layer, | ||||
|         ) | ||||
|     model.set_ref("tok2vec", tok2vec) | ||||
|     if nO is not None: | ||||
|         model.set_dim("nO", cast(int, nO)) | ||||
|     model.attrs["multi_label"] = not exclusive_classes | ||||
|     return model | ||||
|     return build_reduce_text_classifier( | ||||
|         tok2vec=tok2vec, | ||||
|         exclusive_classes=exclusive_classes, | ||||
|         use_reduce_first=False, | ||||
|         use_reduce_last=False, | ||||
|         use_reduce_max=False, | ||||
|         use_reduce_mean=True, | ||||
|         nO=nO, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def resize_and_set_ref(model, new_nO, resizable_layer): | ||||
|  | @ -95,10 +78,48 @@ def build_bow_text_classifier( | |||
|     ngram_size: int, | ||||
|     no_output_layer: bool, | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     return _build_bow_text_classifier( | ||||
|         exclusive_classes=exclusive_classes, | ||||
|         ngram_size=ngram_size, | ||||
|         no_output_layer=no_output_layer, | ||||
|         nO=nO, | ||||
|         sparse_linear=SparseLinear(nO=nO), | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| @registry.architectures("spacy.TextCatBOW.v3") | ||||
| def build_bow_text_classifier_v3( | ||||
|     exclusive_classes: bool, | ||||
|     ngram_size: int, | ||||
|     no_output_layer: bool, | ||||
|     length: int = 262144, | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     if length < 1: | ||||
|         raise ValueError(Errors.E1056.format(length=length)) | ||||
| 
 | ||||
|     # Find k such that 2**(k-1) < length <= 2**k. | ||||
|     length = 2 ** (length - 1).bit_length() | ||||
| 
 | ||||
|     return _build_bow_text_classifier( | ||||
|         exclusive_classes=exclusive_classes, | ||||
|         ngram_size=ngram_size, | ||||
|         no_output_layer=no_output_layer, | ||||
|         nO=nO, | ||||
|         sparse_linear=SparseLinear_v2(nO=nO, length=length), | ||||
|     ) | ||||
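
A quick sanity check of the rounding rule: `(length - 1).bit_length()` is the smallest `k` with `length <= 2**k`, so the expression rounds the requested table size up to the next power of two before it is passed to `SparseLinear_v2`:

```python
# Worked examples of the power-of-two rounding above.
for length in (1, 2, 3, 1000, 262144, 262145):
    print(length, "->", 2 ** (length - 1).bit_length())
# 1 -> 1, 2 -> 2, 3 -> 4, 1000 -> 1024, 262144 -> 262144, 262145 -> 524288
```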
| 
 | ||||
| 
 | ||||
| def _build_bow_text_classifier( | ||||
|     exclusive_classes: bool, | ||||
|     ngram_size: int, | ||||
|     no_output_layer: bool, | ||||
|     sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd], | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     fill_defaults = {"b": 0, "W": 0} | ||||
|     with Model.define_operators({">>": chain}): | ||||
|         sparse_linear = SparseLinear(nO=nO) | ||||
|         output_layer = None | ||||
|         if not no_output_layer: | ||||
|             fill_defaults["b"] = NEG_VALUE | ||||
|  | @ -127,6 +148,9 @@ def build_text_classifier_v2( | |||
|     linear_model: Model[List[Doc], Floats2d], | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     # TODO: build the model with _build_parametric_attention_with_residual_nonlinear | ||||
|     # in spaCy v4. We don't do this in spaCy v3 to preserve model | ||||
|     # compatibility. | ||||
|     exclusive_classes = not linear_model.attrs["multi_label"] | ||||
|     with Model.define_operators({">>": chain, "|": concatenate}): | ||||
|         width = tok2vec.maybe_get_dim("nO") | ||||
|  | @ -190,3 +214,145 @@ def build_text_classifier_lowdata( | |||
|             model = model >> Dropout(dropout) | ||||
|         model = model >> Logistic() | ||||
|     return model | ||||
| 
 | ||||
| 
 | ||||
| @registry.architectures("spacy.TextCatParametricAttention.v1") | ||||
| def build_textcat_parametric_attention_v1( | ||||
|     tok2vec: Model[List[Doc], List[Floats2d]], | ||||
|     exclusive_classes: bool, | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     width = tok2vec.maybe_get_dim("nO") | ||||
|     parametric_attention = _build_parametric_attention_with_residual_nonlinear( | ||||
|         tok2vec=tok2vec, | ||||
|         nonlinear_layer=Maxout(nI=width, nO=width), | ||||
|         key_transform=Gelu(nI=width, nO=width), | ||||
|     ) | ||||
|     with Model.define_operators({">>": chain}): | ||||
|         if exclusive_classes: | ||||
|             output_layer = Softmax(nO=nO) | ||||
|         else: | ||||
|             output_layer = Linear(nO=nO) >> Logistic() | ||||
|         model = parametric_attention >> output_layer | ||||
|     if model.has_dim("nO") is not False and nO is not None: | ||||
|         model.set_dim("nO", cast(int, nO)) | ||||
|     model.set_ref("output_layer", output_layer) | ||||
|     model.attrs["multi_label"] = not exclusive_classes | ||||
| 
 | ||||
|     return model | ||||
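
For orientation, a hedged sketch of building this architecture through the registry, as the model tests later in this diff do; the `HashEmbedCNN.v2` settings are illustrative assumptions, not documented defaults:

```python
# Build the registered architecture via the registry (illustrative values).
from spacy.util import registry

build_textcat = registry.architectures.get("spacy.TextCatParametricAttention.v1")
build_tok2vec = registry.architectures.get("spacy.HashEmbedCNN.v2")

tok2vec = build_tok2vec(
    width=64,
    depth=2,
    embed_size=2000,
    window_size=1,
    maxout_pieces=3,
    subword_features=True,
    pretrained_vectors=None,
)
# The result is a thinc Model mapping List[Doc] -> Floats2d; it would be
# initialized with sample Docs before training.
model = build_textcat(tok2vec=tok2vec, exclusive_classes=True, nO=3)
```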
| 
 | ||||
| 
 | ||||
| def _build_parametric_attention_with_residual_nonlinear( | ||||
|     *, | ||||
|     tok2vec: Model[List[Doc], List[Floats2d]], | ||||
|     nonlinear_layer: Model[Floats2d, Floats2d], | ||||
|     key_transform: Optional[Model[Floats2d, Floats2d]] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     with Model.define_operators({">>": chain, "|": concatenate}): | ||||
|         width = tok2vec.maybe_get_dim("nO") | ||||
|         attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform) | ||||
|         norm_layer = LayerNorm(nI=width) | ||||
|         parametric_attention = ( | ||||
|             tok2vec | ||||
|             >> list2ragged() | ||||
|             >> attention_layer | ||||
|             >> reduce_sum() | ||||
|             >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0)) | ||||
|         ) | ||||
| 
 | ||||
|         parametric_attention.init = _init_parametric_attention_with_residual_nonlinear | ||||
| 
 | ||||
|         parametric_attention.set_ref("tok2vec", tok2vec) | ||||
|         parametric_attention.set_ref("attention_layer", attention_layer) | ||||
|         parametric_attention.set_ref("nonlinear_layer", nonlinear_layer) | ||||
|         parametric_attention.set_ref("norm_layer", norm_layer) | ||||
| 
 | ||||
|         return parametric_attention | ||||
| 
 | ||||
| 
 | ||||
| def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model: | ||||
|     tok2vec_width = get_tok2vec_width(model) | ||||
|     model.get_ref("attention_layer").set_dim("nO", tok2vec_width) | ||||
|     model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width) | ||||
|     model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width) | ||||
|     model.get_ref("norm_layer").set_dim("nI", tok2vec_width) | ||||
|     model.get_ref("norm_layer").set_dim("nO", tok2vec_width) | ||||
|     init_chain(model, X, Y) | ||||
|     return model | ||||
| 
 | ||||
| 
 | ||||
| @registry.architectures("spacy.TextCatReduce.v1") | ||||
| def build_reduce_text_classifier( | ||||
|     tok2vec: Model, | ||||
|     exclusive_classes: bool, | ||||
|     use_reduce_first: bool, | ||||
|     use_reduce_last: bool, | ||||
|     use_reduce_max: bool, | ||||
|     use_reduce_mean: bool, | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     """Build a model that classifies pooled `Doc` representations. | ||||
| 
 | ||||
|     Pooling is performed using reductions. Reductions are concatenated when | ||||
|     multiple reductions are used. | ||||
| 
 | ||||
|     tok2vec (Model): the tok2vec layer to pool over. | ||||
|     exclusive_classes (bool): Whether or not classes are mutually exclusive. | ||||
|     use_reduce_first (bool): Pool by using the hidden representation of the | ||||
|         first token of a `Doc`. | ||||
|     use_reduce_last (bool): Pool by using the hidden representation of the | ||||
|         last token of a `Doc`. | ||||
|     use_reduce_max (bool): Pool by taking the maximum values of the hidden | ||||
|         representations of a `Doc`. | ||||
|     use_reduce_mean (bool): Pool by taking the mean of all hidden | ||||
|         representations of a `Doc`. | ||||
|     nO (Optional[int]): Number of classes. | ||||
|     """ | ||||
| 
 | ||||
|     fill_defaults = {"b": 0, "W": 0} | ||||
|     reductions = [] | ||||
|     if use_reduce_first: | ||||
|         reductions.append(reduce_first()) | ||||
|     if use_reduce_last: | ||||
|         reductions.append(reduce_last()) | ||||
|     if use_reduce_max: | ||||
|         reductions.append(reduce_max()) | ||||
|     if use_reduce_mean: | ||||
|         reductions.append(reduce_mean()) | ||||
| 
 | ||||
|     if not len(reductions): | ||||
|         raise ValueError(Errors.E1057) | ||||
| 
 | ||||
|     with Model.define_operators({">>": chain}): | ||||
|         cnn = tok2vec >> list2ragged() >> concatenate(*reductions) | ||||
|         nO_tok2vec = tok2vec.maybe_get_dim("nO") | ||||
|         nI = nO_tok2vec * len(reductions) if nO_tok2vec is not None else None | ||||
|         if exclusive_classes: | ||||
|             output_layer = Softmax(nO=nO, nI=nI) | ||||
|             fill_defaults["b"] = NEG_VALUE | ||||
|             resizable_layer: Model = resizable( | ||||
|                 output_layer, | ||||
|                 resize_layer=partial( | ||||
|                     resize_linear_weighted, fill_defaults=fill_defaults | ||||
|                 ), | ||||
|             ) | ||||
|             model = cnn >> resizable_layer | ||||
|         else: | ||||
|             output_layer = Linear(nO=nO, nI=nI) | ||||
|             resizable_layer = resizable( | ||||
|                 output_layer, | ||||
|                 resize_layer=partial( | ||||
|                     resize_linear_weighted, fill_defaults=fill_defaults | ||||
|                 ), | ||||
|             ) | ||||
|             model = cnn >> resizable_layer >> Logistic() | ||||
|         model.set_ref("output_layer", output_layer) | ||||
|         model.attrs["resize_output"] = partial( | ||||
|             resize_and_set_ref, | ||||
|             resizable_layer=resizable_layer, | ||||
|         ) | ||||
|     model.set_ref("tok2vec", tok2vec) | ||||
|     if nO is not None: | ||||
|         model.set_dim("nO", cast(int, nO)) | ||||
|     model.attrs["multi_label"] = not exclusive_classes | ||||
|     return model | ||||
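
A small numeric sketch of the pooling this builds, using the same thinc primitives as the `cnn` pipeline above: with two reductions enabled, the pooled width is `2 * nO(tok2vec)`, which is what the `nI` computation accounts for.

```python
# Numeric sketch of concatenated reductions with thinc primitives.
import numpy as np
from thinc.api import chain, concatenate, list2ragged, reduce_max, reduce_mean

pool = chain(list2ragged(), concatenate(reduce_mean(), reduce_max()))
docs = [np.random.rand(5, 8).astype("f"), np.random.rand(3, 8).astype("f")]
pooled = pool.predict(docs)
print(pooled.shape)  # (2, 16): mean and max pooling concatenated per "doc"
```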
|  |  | |||
|  | @ -22,6 +22,7 @@ from .trainable_pipe import TrainablePipe | |||
| __all__ = [ | ||||
|     "AttributeRuler", | ||||
|     "DependencyParser", | ||||
|     "EditTreeLemmatizer", | ||||
|     "EntityLinker", | ||||
|     "EntityRecognizer", | ||||
|     "EntityRuler", | ||||
|  |  | |||
|  | @ -29,7 +29,7 @@ cdef class StateClass: | |||
|         return [self.B(i) for i in range(self.c.buffer_length())] | ||||
| 
 | ||||
|     @property | ||||
|     def token_vector_lenth(self): | ||||
|     def token_vector_length(self): | ||||
|         return self.doc.tensor.shape[1] | ||||
| 
 | ||||
|     @property | ||||
|  |  | |||
|  | @ -36,8 +36,9 @@ maxout_pieces = 3 | |||
| depth = 2 | ||||
| 
 | ||||
| [model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| """ | ||||
|  | @ -45,16 +46,21 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m | |||
| 
 | ||||
| single_label_bow_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| """ | ||||
| 
 | ||||
| single_label_cnn_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatCNN.v2" | ||||
| @architectures = "spacy.TextCatReduce.v1" | ||||
| exclusive_classes = true | ||||
| use_reduce_first = false | ||||
| use_reduce_last = false | ||||
| use_reduce_max = false | ||||
| use_reduce_mean = true | ||||
| 
 | ||||
| [model.tok2vec] | ||||
| @architectures = "spacy.HashEmbedCNN.v2" | ||||
|  |  | |||
|  | @ -35,8 +35,9 @@ maxout_pieces = 3 | |||
| depth = 2 | ||||
| 
 | ||||
| [model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| """ | ||||
|  | @ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod | |||
| 
 | ||||
| multi_label_bow_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = false | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
|  | @ -52,8 +53,12 @@ no_output_layer = false | |||
| 
 | ||||
| multi_label_cnn_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatCNN.v2" | ||||
| @architectures = "spacy.TextCatReduce.v1" | ||||
| exclusive_classes = false | ||||
| use_reduce_first = false | ||||
| use_reduce_last = false | ||||
| use_reduce_max = false | ||||
| use_reduce_mean = true | ||||
| 
 | ||||
| [model.tok2vec] | ||||
| @architectures = "spacy.HashEmbedCNN.v2" | ||||
|  |  | |||
							
								
								
									
spacy/scorer.py (138 changed lines)
|  | @ -802,6 +802,140 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: | |||
|         } | ||||
| 
 | ||||
| 
 | ||||
| # The following implementation of trapezoid() is adapted from SciPy, | ||||
| # which is distributed under the New BSD License. | ||||
| # Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers. | ||||
| # See licenses/3rd_party_licenses.txt | ||||
| def trapezoid(y, x=None, dx=1.0, axis=-1): | ||||
|     r""" | ||||
|     Integrate along the given axis using the composite trapezoidal rule. | ||||
| 
 | ||||
|     If `x` is provided, the integration happens in sequence along its | ||||
|     elements - they are not sorted. | ||||
| 
 | ||||
|     Integrate `y` (`x`) along each 1d slice on the given axis, compute | ||||
|     :math:`\int y(x) dx`. | ||||
|     When `x` is specified, this integrates along the parametric curve, | ||||
|     computing :math:`\int_t y(t) dt = | ||||
|     \int_t y(t) \left.\frac{dx}{dt}\right|_{x=x(t)} dt`. | ||||
| 
 | ||||
|     Parameters | ||||
|     ---------- | ||||
|     y : array_like | ||||
|         Input array to integrate. | ||||
|     x : array_like, optional | ||||
|         The sample points corresponding to the `y` values. If `x` is None, | ||||
|         the sample points are assumed to be evenly spaced `dx` apart. The | ||||
|         default is None. | ||||
|     dx : scalar, optional | ||||
|         The spacing between sample points when `x` is None. The default is 1. | ||||
|     axis : int, optional | ||||
|         The axis along which to integrate. | ||||
| 
 | ||||
|     Returns | ||||
|     ------- | ||||
|     trapezoid : float or ndarray | ||||
|         Definite integral of `y`, approximated along a single axis by the | ||||
|         trapezoidal rule. If `y` is a 1-dimensional array, the result is a | ||||
|         float; if `y` is `n`-dimensional, the result is an (`n`-1)-dimensional | ||||
|         array. | ||||
| 
 | ||||
|     See Also | ||||
|     -------- | ||||
|     cumulative_trapezoid, simpson, romb | ||||
| 
 | ||||
|     Notes | ||||
|     ----- | ||||
|     Image [2]_ illustrates the trapezoidal rule: y-axis locations of the | ||||
|     points are taken from the `y` array; x-axis distances between points | ||||
|     default to 1.0, but can instead be provided via the `x` array or the | ||||
|     `dx` scalar. The return value equals the combined area under the red | ||||
|     lines. | ||||
| 
 | ||||
|     References | ||||
|     ---------- | ||||
|     .. [1] Wikipedia page: https://en.wikipedia.org/wiki/Trapezoidal_rule | ||||
| 
 | ||||
|     .. [2] Illustration image: | ||||
|            https://en.wikipedia.org/wiki/File:Composite_trapezoidal_rule_illustration.png | ||||
| 
 | ||||
|     Examples | ||||
|     -------- | ||||
|     Use the trapezoidal rule on evenly spaced points: | ||||
| 
 | ||||
|     >>> import numpy as np | ||||
|     >>> from scipy import integrate | ||||
|     >>> integrate.trapezoid([1, 2, 3]) | ||||
|     4.0 | ||||
| 
 | ||||
|     The spacing between sample points can be selected by either the | ||||
|     ``x`` or ``dx`` arguments: | ||||
| 
 | ||||
|     >>> integrate.trapezoid([1, 2, 3], x=[4, 6, 8]) | ||||
|     8.0 | ||||
|     >>> integrate.trapezoid([1, 2, 3], dx=2) | ||||
|     8.0 | ||||
| 
 | ||||
|     Using a decreasing ``x`` corresponds to integrating in reverse: | ||||
| 
 | ||||
|     >>> integrate.trapezoid([1, 2, 3], x=[8, 6, 4]) | ||||
|     -8.0 | ||||
| 
 | ||||
|     More generally ``x`` is used to integrate along a parametric curve. We can | ||||
|     estimate the integral :math:`\int_0^1 x^2 \, dx = 1/3` using: | ||||
| 
 | ||||
|     >>> x = np.linspace(0, 1, num=50) | ||||
|     >>> y = x**2 | ||||
|     >>> integrate.trapezoid(y, x) | ||||
|     0.33340274885464394 | ||||
| 
 | ||||
|     Or estimate the area of a circle, noting we repeat the sample which closes | ||||
|     the curve: | ||||
| 
 | ||||
|     >>> theta = np.linspace(0, 2 * np.pi, num=1000, endpoint=True) | ||||
|     >>> integrate.trapezoid(np.cos(theta), x=np.sin(theta)) | ||||
|     3.141571941375841 | ||||
| 
 | ||||
|     ``trapezoid`` can be applied along a specified axis to do multiple | ||||
|     computations in one call: | ||||
| 
 | ||||
|     >>> a = np.arange(6).reshape(2, 3) | ||||
|     >>> a | ||||
|     array([[0, 1, 2], | ||||
|            [3, 4, 5]]) | ||||
|     >>> integrate.trapezoid(a, axis=0) | ||||
|     array([1.5, 2.5, 3.5]) | ||||
|     >>> integrate.trapezoid(a, axis=1) | ||||
|     array([2.,  8.]) | ||||
|     """ | ||||
|     y = np.asanyarray(y) | ||||
|     if x is None: | ||||
|         d = dx | ||||
|     else: | ||||
|         x = np.asanyarray(x) | ||||
|         if x.ndim == 1: | ||||
|             d = np.diff(x) | ||||
|             # reshape to correct shape | ||||
|             shape = [1] * y.ndim | ||||
|             shape[axis] = d.shape[0] | ||||
|             d = d.reshape(shape) | ||||
|         else: | ||||
|             d = np.diff(x, axis=axis) | ||||
|     nd = y.ndim | ||||
|     slice1 = [slice(None)] * nd | ||||
|     slice2 = [slice(None)] * nd | ||||
|     slice1[axis] = slice(1, None) | ||||
|     slice2[axis] = slice(None, -1) | ||||
|     try: | ||||
|         ret = (d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0).sum(axis) | ||||
|     except ValueError: | ||||
|         # Operations didn't work, cast to ndarray | ||||
|         d = np.asarray(d) | ||||
|         y = np.asarray(y) | ||||
|         ret = np.add.reduce(d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0, axis) | ||||
|     return ret | ||||
| 
 | ||||
| 
 | ||||
| # The following implementation of roc_auc_score() is adapted from | ||||
| # scikit-learn, which is distributed under the New BSD License. | ||||
| # Copyright (c) 2007–2019 The scikit-learn developers. | ||||
|  | @ -1024,9 +1158,9 @@ def _auc(x, y): | |||
|         else: | ||||
|             raise ValueError(Errors.E164.format(x=x)) | ||||
| 
 | ||||
|     area = direction * np.trapz(y, x) | ||||
|     area = direction * trapezoid(y, x) | ||||
|     if isinstance(area, np.memmap): | ||||
|         # Reductions such as .sum used internally in np.trapz do not return a | ||||
|         # Reductions such as .sum used internally in trapezoid do not return a | ||||
|         # scalar by default for numpy.memmap instances contrary to | ||||
|         # regular numpy.ndarray instances. | ||||
|         area = area.dtype.type(area) | ||||
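
To make the connection concrete, a toy sketch of what `_auc` computes, using the `trapezoid` helper defined above in this module (values chosen for illustration):

```python
# Toy sketch: the area under a ROC curve via the trapezoid helper above
# (assumes trapezoid from this module is in scope).
import numpy as np

fpr = np.array([0.0, 0.1, 0.4, 1.0])  # ascending, so direction is +1 in _auc
tpr = np.array([0.0, 0.6, 0.8, 1.0])
print(trapezoid(tpr, fpr))  # ~0.78, the ROC AUC for these points
```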
|  |  | |||
|  | @ -162,6 +162,11 @@ def fi_tokenizer(): | |||
|     return get_lang_class("fi")().tokenizer | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def fo_tokenizer(): | ||||
|     return get_lang_class("fo")().tokenizer | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def fr_tokenizer(): | ||||
|     return get_lang_class("fr")().tokenizer | ||||
|  | @ -317,6 +322,11 @@ def nl_tokenizer(): | |||
|     return get_lang_class("nl")().tokenizer | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def nn_tokenizer(): | ||||
|     return get_lang_class("nn")().tokenizer | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(scope="session") | ||||
| def pl_tokenizer(): | ||||
|     return get_lang_class("pl")().tokenizer | ||||
|  |  | |||
|  | @ -731,3 +731,12 @@ def test_for_no_ent_sents(): | |||
|     sents = list(doc.ents[0].sents) | ||||
|     assert len(sents) == 1 | ||||
|     assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY" | ||||
| 
 | ||||
| 
 | ||||
| def test_span_api_richcmp_other(en_tokenizer): | ||||
|     doc1 = en_tokenizer("a b") | ||||
|     doc2 = en_tokenizer("b c") | ||||
|     assert not doc1[1:2] == doc1[1] | ||||
|     assert not doc1[1:2] == doc2[0] | ||||
|     assert not doc1[1:2] == doc2[0:1] | ||||
|     assert not doc1[0:1] == doc2 | ||||
|  |  | |||
|  | @ -294,3 +294,12 @@ def test_missing_head_dep(en_vocab): | |||
|     assert aligned_heads[0] == ref_heads[0] | ||||
|     assert aligned_deps[5] == ref_deps[5] | ||||
|     assert aligned_heads[5] == ref_heads[5] | ||||
| 
 | ||||
| 
 | ||||
| def test_token_api_richcmp_other(en_tokenizer): | ||||
|     doc1 = en_tokenizer("a b") | ||||
|     doc2 = en_tokenizer("b c") | ||||
|     assert not doc1[1] == doc1[0:1] | ||||
|     assert not doc1[1] == doc2[1:2] | ||||
|     assert not doc1[1] == doc2[0] | ||||
|     assert not doc1[0] == doc2 | ||||
|  |  | |||
							
								
								
									
spacy/tests/lang/fo/__init__.py (0 lines, new file)
spacy/tests/lang/fo/test_tokenizer.py (26 lines, new file)
|  | @ -0,0 +1,26 @@ | |||
| import pytest | ||||
| 
 | ||||
| # examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed with CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/) | ||||
| # fmt: off | ||||
| FO_TOKEN_EXCEPTION_TESTS = [ | ||||
|     ( | ||||
|         "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ", | ||||
|         [ | ||||
|             "Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", ".", | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         "Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.", | ||||
|         [ | ||||
|             "Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", ".", | ||||
|         ], | ||||
|     ), | ||||
| ] | ||||
| # fmt: on | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS) | ||||
| def test_fo_tokenizer_handles_exception_cases(fo_tokenizer, text, expected_tokens): | ||||
|     tokens = fo_tokenizer(text) | ||||
|     token_list = [token.text for token in tokens if not token.is_space] | ||||
|     assert expected_tokens == token_list | ||||
							
								
								
									
spacy/tests/lang/nn/__init__.py (0 lines, new file)
spacy/tests/lang/nn/test_tokenizer.py (38 lines, new file)
|  | @ -0,0 +1,38 @@ | |||
| import pytest | ||||
| 
 | ||||
| # examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) | ||||
| # fmt: off | ||||
| NN_TOKEN_EXCEPTION_TESTS = [ | ||||
|     ( | ||||
|         "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.", | ||||
|         [ | ||||
|             "Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", ".", | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         "Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.", | ||||
|         [ | ||||
|             "Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", ".", | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         "Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.", | ||||
|         [ | ||||
|             "Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", ".", | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         "Brukssesongen er frå nov. til mai, med ein topp i mars.", | ||||
|         [ | ||||
|             "Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", ".", | ||||
|         ], | ||||
|     ), | ||||
| ] | ||||
| # fmt: on | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS) | ||||
| def test_nn_tokenizer_handles_exception_cases(nn_tokenizer, text, expected_tokens): | ||||
|     tokens = nn_tokenizer(text) | ||||
|     token_list = [token.text for token in tokens if not token.is_space] | ||||
|     assert expected_tokens == token_list | ||||
|  | @ -203,7 +203,7 @@ def test_pipe_class_component_model(): | |||
|             "@architectures": "spacy.TextCatEnsemble.v2", | ||||
|             "tok2vec": DEFAULT_TOK2VEC_MODEL, | ||||
|             "linear_model": { | ||||
|                 "@architectures": "spacy.TextCatBOW.v2", | ||||
|                 "@architectures": "spacy.TextCatBOW.v3", | ||||
|                 "exclusive_classes": False, | ||||
|                 "ngram_size": 1, | ||||
|                 "no_output_layer": False, | ||||
|  |  | |||
|  | @ -414,7 +414,7 @@ def test_implicit_label(name, get_examples): | |||
| @pytest.mark.parametrize( | ||||
|     "name,textcat_config", | ||||
|     [ | ||||
|         # BOW | ||||
|         # BOW V1 | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||
|  | @ -451,14 +451,14 @@ def test_no_resize(name, textcat_config): | |||
| @pytest.mark.parametrize( | ||||
|     "name,textcat_config", | ||||
|     [ | ||||
|         # BOW | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), | ||||
|         # BOW V3 | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), | ||||
|         # CNN | ||||
|         ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||
|     ], | ||||
| ) | ||||
| # fmt: on | ||||
|  | @ -480,14 +480,14 @@ def test_resize(name, textcat_config): | |||
| @pytest.mark.parametrize( | ||||
|     "name,textcat_config", | ||||
|     [ | ||||
|         # BOW | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), | ||||
|         # CNN | ||||
|         ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), | ||||
|         # BOW v3 | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), | ||||
|         # REDUCE | ||||
|         ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||
|     ], | ||||
| ) | ||||
| # fmt: on | ||||
|  | @ -693,12 +693,23 @@ def test_overfitting_IO_multi(): | |||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), | ||||
|         # BOW V3 | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), | ||||
|         # ENSEMBLE V2 | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), | ||||
|         # CNN V2 | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), | ||||
|         # CNN V2 (legacy) | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), | ||||
|         # PARAMETRIC ATTENTION V1 | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), | ||||
|         # REDUCE V1 | ||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||
|     ], | ||||
| ) | ||||
| # fmt: on | ||||
|  |  | |||
|  | @ -15,7 +15,12 @@ def doc_w_attrs(en_tokenizer): | |||
|     Token.set_extension("_test_token", default="t0") | ||||
|     doc[1]._._test_token = "t1" | ||||
| 
 | ||||
|     return doc | ||||
|     yield doc | ||||
| 
 | ||||
|     Doc.remove_extension("_test_attr") | ||||
|     Doc.remove_extension("_test_prop") | ||||
|     Doc.remove_extension("_test_method") | ||||
|     Token.remove_extension("_test_token") | ||||
| 
 | ||||
| 
 | ||||
| def test_serialize_ext_attrs_from_bytes(doc_w_attrs): | ||||
|  |  | |||
|  | @ -1061,3 +1061,8 @@ def test_debug_data_trainable_lemmatizer_not_annotated(): | |||
| 
 | ||||
|     data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) | ||||
|     assert data["no_lemma_annotations"] == 2 | ||||
| 
 | ||||
| 
 | ||||
| def test_project_api_imports(): | ||||
|     from spacy.cli import project_run | ||||
|     from spacy.cli.project.run import project_run  # noqa: F401, F811 | ||||
|  |  | |||
|  | @ -214,9 +214,6 @@ def test_project_clone(options): | |||
|         assert (out / "README.md").is_file() | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.skipif( | ||||
|     sys.version_info >= (3, 12), reason="Python 3.12+ not supported for remotes" | ||||
| ) | ||||
| def test_project_push_pull(project_dir): | ||||
|     proj = dict(SAMPLE_PROJECT) | ||||
|     remote = "xyz" | ||||
|  | @ -241,7 +238,7 @@ def test_project_push_pull(project_dir): | |||
| 
 | ||||
| def test_find_function_valid(): | ||||
|     # example of architecture in main code base | ||||
|     function = "spacy.TextCatBOW.v2" | ||||
|     function = "spacy.TextCatBOW.v3" | ||||
|     result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"]) | ||||
|     assert f"Found registered function '{function}'" in result.stdout | ||||
|     assert "textcat.py" in result.stdout | ||||
|  | @ -260,7 +257,7 @@ def test_find_function_valid(): | |||
| 
 | ||||
| def test_find_function_invalid(): | ||||
|     # invalid registry | ||||
|     function = "spacy.TextCatBOW.v2" | ||||
|     function = "spacy.TextCatBOW.v3" | ||||
|     registry = "foobar" | ||||
|     result = CliRunner().invoke( | ||||
|         app, ["find-function", function, "--registry", registry] | ||||
|  |  | |||
|  | @ -2,7 +2,7 @@ import numpy | |||
| import pytest | ||||
| 
 | ||||
| from spacy import displacy | ||||
| from spacy.displacy.render import DependencyRenderer, EntityRenderer | ||||
| from spacy.displacy.render import DependencyRenderer, EntityRenderer, SpanRenderer | ||||
| from spacy.lang.en import English | ||||
| from spacy.lang.fa import Persian | ||||
| from spacy.tokens import Doc, Span | ||||
|  | @ -468,3 +468,23 @@ def test_issue12816(en_vocab) -> None: | |||
|     # Verify that the HTML tag is still escaped | ||||
|     html = displacy.render(doc, style="span") | ||||
|     assert "<TEST>" in html | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.issue(13056) | ||||
| def test_displacy_span_stacking(): | ||||
|     """Test whether span stacking works properly for multiple overlapping spans.""" | ||||
|     spans = [ | ||||
|         {"start_token": 2, "end_token": 5, "label": "SkillNC"}, | ||||
|         {"start_token": 0, "end_token": 2, "label": "Skill"}, | ||||
|         {"start_token": 1, "end_token": 3, "label": "Skill"}, | ||||
|     ] | ||||
|     tokens = ["Welcome", "to", "the", "Bank", "of", "China", "."] | ||||
|     per_token_info = SpanRenderer._assemble_per_token_info(spans=spans, tokens=tokens) | ||||
| 
 | ||||
|     assert len(per_token_info) == len(tokens) | ||||
|     assert all([len(per_token_info[i]["entities"]) == 1 for i in (0, 3, 4)]) | ||||
|     assert all([len(per_token_info[i]["entities"]) == 2 for i in (1, 2)]) | ||||
|     assert per_token_info[1]["entities"][0]["render_slot"] == 1 | ||||
|     assert per_token_info[1]["entities"][1]["render_slot"] == 2 | ||||
|     assert per_token_info[2]["entities"][0]["render_slot"] == 2 | ||||
|     assert per_token_info[2]["entities"][1]["render_slot"] == 3 | ||||
|  |  | |||
|  | @ -376,8 +376,9 @@ def test_util_dot_section(): | |||
|     factory = "textcat" | ||||
| 
 | ||||
|     [components.textcat.model] | ||||
|     @architectures = "spacy.TextCatBOW.v2" | ||||
|     @architectures = "spacy.TextCatBOW.v3" | ||||
|     exclusive_classes = true | ||||
|     length = 262144 | ||||
|     ngram_size = 1 | ||||
|     no_output_layer = false | ||||
|     """ | ||||
|  | @ -485,8 +486,8 @@ def test_to_ternary_int(): | |||
| 
 | ||||
| def test_find_available_port(): | ||||
|     host = "0.0.0.0" | ||||
|     port = 5000 | ||||
|     assert find_available_port(port, host) == port, "Port 5000 isn't free" | ||||
|     port = 5001 | ||||
|     assert find_available_port(port, host) == port, "Port 5001 isn't free" | ||||
| 
 | ||||
|     from wsgiref.simple_server import demo_app, make_server | ||||
| 
 | ||||
|  |  | |||
|  | @ -26,6 +26,7 @@ from spacy.ml.models import ( | |||
|     build_Tok2Vec_model, | ||||
| ) | ||||
| from spacy.ml.staticvectors import StaticVectors | ||||
| from spacy.util import registry | ||||
| 
 | ||||
| 
 | ||||
| def get_textcat_bow_kwargs(): | ||||
|  | @ -284,3 +285,17 @@ def test_spancat_model_forward_backward(nO=5): | |||
|     Y, backprop = model((docs, spans), is_train=True) | ||||
|     assert Y.shape == (spans.dataXd.shape[0], nO) | ||||
|     backprop(Y) | ||||
| 
 | ||||
| 
 | ||||
| def test_textcat_reduce_invalid_args(): | ||||
|     textcat_reduce = registry.architectures.get("spacy.TextCatReduce.v1") | ||||
|     tok2vec = make_test_tok2vec() | ||||
|     with pytest.raises(ValueError, match=r"must be used with at least one reduction"): | ||||
|         textcat_reduce( | ||||
|             tok2vec=tok2vec, | ||||
|             exclusive_classes=False, | ||||
|             use_reduce_first=False, | ||||
|             use_reduce_last=False, | ||||
|             use_reduce_max=False, | ||||
|             use_reduce_mean=False, | ||||
|         ) | ||||
|  |  | |||
|  | @ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab): | |||
|     assert tokens == explain_tokens | ||||
| 
 | ||||
| 
 | ||||
| def test_tokenizer_explain_special_matcher_whitespace(en_vocab): | ||||
|     rules = {":]": [{"ORTH": ":]"}]} | ||||
|     tokenizer = Tokenizer( | ||||
|         en_vocab, | ||||
|         rules=rules, | ||||
|     ) | ||||
|     text = ": ]" | ||||
|     tokens = [t.text for t in tokenizer(text)] | ||||
|     explain_tokens = [t[1] for t in tokenizer.explain(text)] | ||||
|     assert tokens == explain_tokens | ||||
| 
 | ||||
| 
 | ||||
| @hypothesis.strategies.composite | ||||
| def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str: | ||||
|     """ | ||||
|  | @ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None: | |||
|     """ | ||||
| 
 | ||||
|     tokenizer: Tokenizer = spacy.blank(lang).tokenizer | ||||
|     tokens = [t.text for t in tokenizer(sentence) if not t.is_space] | ||||
|     # Tokenizer.explain is not intended to handle whitespace or control | ||||
|     # characters in the same way as Tokenizer | ||||
|     sentence = re.sub(r"\s+", " ", sentence).strip() | ||||
|     tokens = [t.text for t in tokenizer(sentence)] | ||||
|     debug_tokens = [t[1] for t in tokenizer.explain(sentence)] | ||||
|     assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}" | ||||
|  |  | |||
|  | @ -730,9 +730,16 @@ cdef class Tokenizer: | |||
|             if i in spans_by_start: | ||||
|                 span = spans_by_start[i] | ||||
|                 exc = [d[ORTH] for d in special_cases[span.label_]] | ||||
|                 for j, orth in enumerate(exc): | ||||
|                     final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth])) | ||||
|                 i += len(span) | ||||
|                 # The phrase matcher can overmatch for tokens separated by | ||||
|                 # spaces in the text but not in the underlying rule, so skip | ||||
|                 # cases where the texts aren't identical | ||||
|                 if span.text != "".join([self.vocab.strings[orth] for orth in exc]): | ||||
|                     final_tokens.append(tokens[i]) | ||||
|                     i += 1 | ||||
|                 else: | ||||
|                     for j, orth in enumerate(exc): | ||||
|                         final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth])) | ||||
|                     i += len(span) | ||||
|             else: | ||||
|                 final_tokens.append(tokens[i]) | ||||
|                 i += 1 | ||||
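
The guard mirrors `test_tokenizer_explain_special_matcher_whitespace` earlier in this diff. A hedged sketch of the case it protects against:

```python
# A ":]" special case must not be merged across the whitespace in ": ]",
# even though the phrase matcher matches the two tokens.
from spacy.lang.en import English

tokenizer = English().tokenizer
tokenizer.add_special_case(":]", [{"ORTH": ":]"}])
text = ": ]"
print([t.text for t in tokenizer(text)])        # [':', ']']
print([t[1] for t in tokenizer.explain(text)])  # matches: [':', ']']
```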
|  |  | |||
|  | @ -5,4 +5,4 @@ from .span import Span | |||
| from .span_group import SpanGroup | ||||
| from .token import Token | ||||
| 
 | ||||
| __all__ = ["Doc", "Token", "Span", "SpanGroup", "DocBin", "MorphAnalysis"] | ||||
| __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"] | ||||
|  |  | |||
|  | @ -42,7 +42,7 @@ class Doc: | |||
|     user_hooks: Dict[str, Callable[..., Any]] | ||||
|     user_token_hooks: Dict[str, Callable[..., Any]] | ||||
|     user_span_hooks: Dict[str, Callable[..., Any]] | ||||
|     tensor: np.ndarray[Any, np.dtype[np.float_]] | ||||
|     tensor: np.ndarray[Any, np.dtype[np.float64]] | ||||
|     user_data: Dict[str, Any] | ||||
|     has_unknown_spaces: bool | ||||
|     _context: Any | ||||
|  | @ -125,7 +125,7 @@ class Doc: | |||
|         vector: Optional[Floats1d] = ..., | ||||
|         alignment_mode: str = ..., | ||||
|         span_id: Union[int, str] = ..., | ||||
|     ) -> Span: ... | ||||
|     ) -> Optional[Span]: ... | ||||
|     def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... | ||||
|     @property | ||||
|     def has_vector(self) -> bool: ... | ||||
|  | @ -166,7 +166,7 @@ class Doc: | |||
|     ) -> Doc: ... | ||||
|     def to_array( | ||||
|         self, py_attr_ids: Union[int, str, List[Union[int, str]]] | ||||
|     ) -> np.ndarray[Any, np.dtype[np.float_]]: ... | ||||
|     ) -> np.ndarray[Any, np.dtype[np.float64]]: ... | ||||
|     @staticmethod | ||||
|     def from_docs( | ||||
|         docs: List[Doc], | ||||
|  | @ -179,15 +179,13 @@ class Doc: | |||
|         self, path: Union[str, Path], *, exclude: Iterable[str] = ... | ||||
|     ) -> None: ... | ||||
|     def from_disk( | ||||
|         self, path: Union[str, Path], *, exclude: Union[List[str], Tuple[str]] = ... | ||||
|         self, path: Union[str, Path], *, exclude: Iterable[str] = ... | ||||
|     ) -> Doc: ... | ||||
|     def to_bytes(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ... | ||||
|     def from_bytes( | ||||
|         self, bytes_data: bytes, *, exclude: Union[List[str], Tuple[str]] = ... | ||||
|     ) -> Doc: ... | ||||
|     def to_dict(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ... | ||||
|     def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ... | ||||
|     def from_bytes(self, bytes_data: bytes, *, exclude: Iterable[str] = ...) -> Doc: ... | ||||
|     def to_dict(self, *, exclude: Iterable[str] = ...) -> Dict[str, Any]: ... | ||||
|     def from_dict( | ||||
|         self, msg: bytes, *, exclude: Union[List[str], Tuple[str]] = ... | ||||
|         self, msg: Dict[str, Any], *, exclude: Iterable[str] = ... | ||||
|     ) -> Doc: ... | ||||
|     def extend_tensor(self, tensor: Floats2d) -> None: ... | ||||
|     def retokenize(self) -> Retokenizer: ... | ||||
|  |  | |||
|  | @ -1326,7 +1326,7 @@ cdef class Doc: | |||
| 
 | ||||
|         path (str / Path): A path to a directory. Paths may be either | ||||
|             strings or `Path`-like objects. | ||||
|         exclude (list): String names of serialization fields to exclude. | ||||
|         exclude (Iterable[str]): String names of serialization fields to exclude. | ||||
|         RETURNS (Doc): The modified `Doc` object. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#from_disk | ||||
|  | @ -1339,7 +1339,7 @@ cdef class Doc: | |||
|     def to_bytes(self, *, exclude=tuple()): | ||||
|         """Serialize, i.e. export the document contents to a binary string. | ||||
| 
 | ||||
|         exclude (list): String names of serialization fields to exclude. | ||||
|         exclude (Iterable[str]): String names of serialization fields to exclude. | ||||
|         RETURNS (bytes): A losslessly serialized copy of the `Doc`, including | ||||
|             all annotations. | ||||
| 
 | ||||
|  | @ -1351,7 +1351,7 @@ cdef class Doc: | |||
|         """Deserialize, i.e. import the document contents from a binary string. | ||||
| 
 | ||||
|         data (bytes): The string to load from. | ||||
|         exclude (list): String names of serialization fields to exclude. | ||||
|         exclude (Iterable[str]): String names of serialization fields to exclude. | ||||
|         RETURNS (Doc): Itself. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#from_bytes | ||||
|  | @ -1361,11 +1361,8 @@ cdef class Doc: | |||
|     def to_dict(self, *, exclude=tuple()): | ||||
|         """Export the document contents to a dictionary for serialization. | ||||
| 
 | ||||
|         exclude (list): String names of serialization fields to exclude. | ||||
|         RETURNS (bytes): A losslessly serialized copy of the `Doc`, including | ||||
|             all annotations. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#to_bytes | ||||
|         exclude (Iterable[str]): String names of serialization fields to exclude. | ||||
|         RETURNS (Dict[str, Any]): A dictionary representation of the `Doc` | ||||
|         """ | ||||
|         array_head = Doc._get_array_attrs() | ||||
|         strings = set() | ||||
|  | @ -1411,13 +1408,11 @@ cdef class Doc: | |||
|         return util.to_dict(serializers, exclude) | ||||
| 
 | ||||
|     def from_dict(self, msg, *, exclude=tuple()): | ||||
|         """Deserialize, i.e. import the document contents from a binary string. | ||||
|         """Deserialize the document contents from a dictionary representation. | ||||
| 
 | ||||
|         data (bytes): The string to load from. | ||||
|         exclude (list): String names of serialization fields to exclude. | ||||
|         msg (Dict[str, Any]): The dictionary to load from. | ||||
|         exclude (Iterable[str]): String names of serialization fields to exclude. | ||||
|         RETURNS (Doc): Itself. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#from_dict | ||||
|         """ | ||||
|         if self.length != 0: | ||||
|             raise ValueError(Errors.E033.format(length=self.length)) | ||||
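
A minimal round-trip sketch for the dict-based serialization documented above; note that `from_dict` must be called on an empty `Doc`, per the `E033` check:

```python
# Round-trip sketch for Doc.to_dict / Doc.from_dict.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = nlp("Serialize me")
msg = doc.to_dict()                    # Dict[str, Any]
doc2 = Doc(nlp.vocab).from_dict(msg)   # from_dict requires an empty Doc
assert [t.text for t in doc2] == ["Serialize", "me"]
```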
|  |  | |||
|  | @ -127,14 +127,17 @@ cdef class Span: | |||
|         self._vector = vector | ||||
|         self._vector_norm = vector_norm | ||||
| 
 | ||||
|     def __richcmp__(self, Span other, int op): | ||||
|     def __richcmp__(self, object other, int op): | ||||
|         if other is None: | ||||
|             if op == 0 or op == 1 or op == 2: | ||||
|                 return False | ||||
|             else: | ||||
|                 return True | ||||
|         if not isinstance(other, Span): | ||||
|             return False | ||||
|         cdef Span other_span = other | ||||
|         self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc) | ||||
|         other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.id, other.doc) | ||||
|         other_tuple = (other_span.c.start_char, other_span.c.end_char, other_span.c.label, other_span.c.kb_id, other_span.id, other_span.doc) | ||||
|         # < | ||||
|         if op == 0: | ||||
|             return self_tuple < other_tuple | ||||
|  |  | |||
|  | @ -53,7 +53,12 @@ class Token: | |||
|     def __bytes__(self) -> bytes: ... | ||||
|     def __str__(self) -> str: ... | ||||
|     def __repr__(self) -> str: ... | ||||
|     def __richcmp__(self, other: Token, op: int) -> bool: ... | ||||
|     def __lt__(self, other: Any) -> bool: ... | ||||
|     def __le__(self, other: Any) -> bool: ... | ||||
|     def __eq__(self, other: Any) -> bool: ... | ||||
|     def __ne__(self, other: Any) -> bool: ... | ||||
|     def __gt__(self, other: Any) -> bool: ... | ||||
|     def __ge__(self, other: Any) -> bool: ... | ||||
|     @property | ||||
|     def _(self) -> Underscore: ... | ||||
|     def nbor(self, i: int = ...) -> Token: ... | ||||
|  |  | |||
|  | @ -139,17 +139,20 @@ cdef class Token: | |||
|     def __repr__(self): | ||||
|         return self.__str__() | ||||
| 
 | ||||
|     def __richcmp__(self, Token other, int op): | ||||
|     def __richcmp__(self, object other, int op): | ||||
|         # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html | ||||
|         if other is None: | ||||
|             if op in (0, 1, 2): | ||||
|                 return False | ||||
|             else: | ||||
|                 return True | ||||
|         if not isinstance(other, Token): | ||||
|             return False | ||||
|         cdef Token other_token = other | ||||
|         cdef Doc my_doc = self.doc | ||||
|         cdef Doc other_doc = other.doc | ||||
|         cdef Doc other_doc = other_token.doc | ||||
|         my = self.idx | ||||
|         their = other.idx | ||||
|         their = other_token.idx | ||||
|         if op == 0: | ||||
|             return my < their | ||||
|         elif op == 2: | ||||
|  |  | |||
|  | @ -16,3 +16,28 @@ from .iob_utils import (  # noqa: F401 | |||
|     tags_to_entities, | ||||
| ) | ||||
| from .loggers import console_logger  # noqa: F401 | ||||
| 
 | ||||
| __all__ = [ | ||||
|     "Alignment", | ||||
|     "Corpus", | ||||
|     "Example", | ||||
|     "JsonlCorpus", | ||||
|     "PlainTextCorpus", | ||||
|     "biluo_tags_to_offsets", | ||||
|     "biluo_tags_to_spans", | ||||
|     "biluo_to_iob", | ||||
|     "create_copy_from_base_model", | ||||
|     "docs_to_json", | ||||
|     "dont_augment", | ||||
|     "iob_to_biluo", | ||||
|     "minibatch_by_padded_size", | ||||
|     "minibatch_by_words", | ||||
|     "offsets_to_biluo_tags", | ||||
|     "orth_variants_augmenter", | ||||
|     "read_json_file", | ||||
|     "remove_bilu_prefix", | ||||
|     "split_bilu_label", | ||||
|     "tags_to_entities", | ||||
|     "validate_get_examples", | ||||
|     "validate_examples", | ||||
| ] | ||||
|  |  | |||
|  | @ -1077,20 +1077,38 @@ def make_tempdir() -> Generator[Path, None, None]: | |||
| 
 | ||||
| 
 | ||||
| def is_in_jupyter() -> bool: | ||||
|     """Check if user is running spaCy from a Jupyter notebook by detecting the | ||||
|     IPython kernel. Mainly used for the displaCy visualizer. | ||||
|     RETURNS (bool): True if in Jupyter, False if not. | ||||
|     """Check if user is running spaCy from a Jupyter or Colab notebook by | ||||
|     detecting the IPython kernel. Mainly used for the displaCy visualizer. | ||||
|     RETURNS (bool): True if in Jupyter/Colab, False if not. | ||||
|     """ | ||||
|     # https://stackoverflow.com/a/39662359/6400719 | ||||
|     # https://stackoverflow.com/questions/15411967 | ||||
|     try: | ||||
|         shell = get_ipython().__class__.__name__  # type: ignore[name-defined] | ||||
|         if shell == "ZMQInteractiveShell": | ||||
|         if get_ipython().__class__.__name__ == "ZMQInteractiveShell":  # type: ignore[name-defined] | ||||
|             return True  # Jupyter notebook or qtconsole | ||||
|         if get_ipython().__class__.__module__ == "google.colab._shell":  # type: ignore[name-defined] | ||||
|             return True  # Colab notebook | ||||
|     except NameError: | ||||
|         return False  # Probably standard Python interpreter | ||||
|         pass  # Probably standard Python interpreter | ||||
|     # additional check for Colab | ||||
|     try: | ||||
|         import google.colab | ||||
| 
 | ||||
|         return True  # Colab notebook | ||||
|     except ImportError: | ||||
|         pass | ||||
|     return False | ||||
| 
 | ||||
| 
 | ||||
| def is_in_interactive() -> bool: | ||||
|     """Check if user is running spaCy from an interactive Python | ||||
|     shell. Will return True in Jupyter notebooks too. | ||||
|     RETURNS (bool): True if in interactive mode, False if not. | ||||
|     """ | ||||
|     # https://stackoverflow.com/questions/2356399/tell-if-python-is-in-interactive-mode | ||||
|     return hasattr(sys, "ps1") or hasattr(sys, "ps2") | ||||
| 
 | ||||
| 
 | ||||
| def get_object_name(obj: Any) -> str: | ||||
|     """Get a human-readable name of a Python object, e.g. a pipeline component. | ||||
| 
 | ||||
|  |  | |||
|  | @ -78,16 +78,16 @@ subword features, and a | |||
| [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer | ||||
| consisting of a CNN and a layer-normalized maxout activation function. | ||||
| 
 | ||||
| | Name                 | Description                                                                                                                                                                                                                                                                   | | ||||
| | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~                                                                                                          | | ||||
| | `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~                                                                                                                                                                                | | ||||
| | `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~                                                                                            | | ||||
| | Name                 | Description                                                                                                                                                                                                                                                                 | | ||||
| | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~                                                                                                        | | ||||
| | `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~                                                                                                                                                                              | | ||||
| | `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~                                                                                          | | ||||
| | `window_size`        | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ | | ||||
| | `maxout_pieces`      | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~                                                                                   | | ||||
| | `subword_features`   | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~                                       | | ||||
| | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~                                                                                                                                                                                                                                  | | ||||
| | **CREATES**          | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                        | | ||||
| | `maxout_pieces`      | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~                                                                                 | | ||||
| | `subword_features`   | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~                                     | | ||||
| | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~                                                                                                                                                                                                                                | | ||||
| | **CREATES**          | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                      | | ||||
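|  | ||||
| As a sketch (assuming this table documents the `spacy.HashEmbedCNN.v2` | ||||
| architecture, whose parameters it matches), a config using the recommended | ||||
| values might look like: | ||||
|  | ||||
| ```ini | ||||
| [model] | ||||
| @architectures = "spacy.HashEmbedCNN.v2" | ||||
| width = 96 | ||||
| depth = 4 | ||||
| embed_size = 2000 | ||||
| window_size = 1 | ||||
| maxout_pieces = 3 | ||||
| subword_features = true | ||||
| pretrained_vectors = null | ||||
| ``` | ||||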
| 
 | ||||
| ### spacy.Tok2VecListener.v1 {id="Tok2VecListener"} | ||||
| 
 | ||||
|  | @ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the | |||
| > nO = null | ||||
| > | ||||
| > [model.linear_model] | ||||
| > @architectures = "spacy.TextCatBOW.v2" | ||||
| > @architectures = "spacy.TextCatBOW.v3" | ||||
| > exclusive_classes = true | ||||
| > length = 262144 | ||||
| > ngram_size = 1 | ||||
| > no_output_layer = false | ||||
| > | ||||
|  | @ -1017,54 +1018,15 @@ but used an internal `tok2vec` instead of taking it as argument: | |||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
| ### spacy.TextCatCNN.v2 {id="TextCatCNN"} | ||||
| ### spacy.TextCatBOW.v3 {id="TextCatBOW"} | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
| > ```ini | ||||
| > [model] | ||||
| > @architectures = "spacy.TextCatCNN.v2" | ||||
| > exclusive_classes = false | ||||
| > nO = null | ||||
| > | ||||
| > [model.tok2vec] | ||||
| > @architectures = "spacy.HashEmbedCNN.v2" | ||||
| > pretrained_vectors = null | ||||
| > width = 96 | ||||
| > depth = 4 | ||||
| > embed_size = 2000 | ||||
| > window_size = 1 | ||||
| > maxout_pieces = 3 | ||||
| > subword_features = true | ||||
| > ``` | ||||
| 
 | ||||
| A neural network model where token vectors are calculated using a CNN. The | ||||
| vectors are mean pooled and used as features in a feed-forward network. This | ||||
| architecture is usually less accurate than the ensemble, but runs faster. | ||||
| 
 | ||||
| | Name                | Description                                                                                                                                                                                    | | ||||
| | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||
| | `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        | | ||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| <Accordion title="spacy.TextCatCNN.v1 definition" spaced> | ||||
| 
 | ||||
| [TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was | ||||
| not yet resizable. Since v2, new labels can be added to this component, even | ||||
| after training. | ||||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
| ### spacy.TextCatBOW.v2 {id="TextCatBOW"} | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
| > ```ini | ||||
| > [model] | ||||
| > @architectures = "spacy.TextCatBOW.v2" | ||||
| > @architectures = "spacy.TextCatBOW.v3" | ||||
| > exclusive_classes = false | ||||
| > length = 262144 | ||||
| > ngram_size = 1 | ||||
| > no_output_layer = false | ||||
| > nO = null | ||||
|  | @ -1078,17 +1040,108 @@ the others, but may not be as accurate, especially if texts are short. | |||
| | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||
| | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           | | ||||
| | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          | | ||||
| | `length`            | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~                                              | | ||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
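|  | ||||
| A minimal usage sketch (assuming a spaCy version that registers | ||||
| `spacy.TextCatBOW.v3`; the labels are illustrative): | ||||
|  | ||||
| ```python | ||||
| import spacy | ||||
|  | ||||
| nlp = spacy.blank("en") | ||||
| # With exclusive_classes = true, use the "textcat" factory; for | ||||
| # multilabel classification, use "textcat_multilabel" with | ||||
| # exclusive_classes = false. | ||||
| config = { | ||||
|     "model": { | ||||
|         "@architectures": "spacy.TextCatBOW.v3", | ||||
|         "exclusive_classes": True, | ||||
|         "ngram_size": 1, | ||||
|         "no_output_layer": False, | ||||
|         "length": 262144, | ||||
|     } | ||||
| } | ||||
| textcat = nlp.add_pipe("textcat", config=config) | ||||
| textcat.add_label("POSITIVE") | ||||
| textcat.add_label("NEGATIVE") | ||||
| ``` | ||||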
| 
 | ||||
| <Accordion title="spacy.TextCatBOW.v1 definition" spaced> | ||||
| <Accordion title="Previous versions of spacy.TextCatBOW" spaced> | ||||
| 
 | ||||
| [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was | ||||
| not yet resizable. Since v2, new labels can be added to this component, even | ||||
| after training. | ||||
| - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2, | ||||
|   new labels can be added to this component, even after training. | ||||
| - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and | ||||
|   [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear | ||||
|   layer that only used a small number of the allocated parameters. | ||||
| - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and | ||||
|   [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument. | ||||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
| ### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"} | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
| > ```ini | ||||
| > [model] | ||||
| > @architectures = "spacy.TextCatParametricAttention.v1" | ||||
| > exclusive_classes = true | ||||
| > nO = null | ||||
| > | ||||
| > [model.tok2vec] | ||||
| > @architectures = "spacy.Tok2Vec.v2" | ||||
| > | ||||
| > [model.tok2vec.embed] | ||||
| > @architectures = "spacy.MultiHashEmbed.v2" | ||||
| > width = 64 | ||||
| > rows = [2000, 2000, 1000, 1000, 1000, 1000] | ||||
| > attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] | ||||
| > include_static_vectors = false | ||||
| > | ||||
| > [model.tok2vec.encode] | ||||
| > @architectures = "spacy.MaxoutWindowEncoder.v2" | ||||
| > width = ${model.tok2vec.embed.width} | ||||
| > window_size = 1 | ||||
| > maxout_pieces = 3 | ||||
| > depth = 2 | ||||
| > ``` | ||||
| 
 | ||||
| A neural network model that is built upon Tok2Vec and uses parametric attention | ||||
| to attend to tokens that are relevant to text classification. | ||||
| 
 | ||||
| | Name                | Description                                                                                                                                                                                    | | ||||
| | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     | | ||||
| | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| ### spacy.TextCatReduce.v1 {id="TextCatReduce"} | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
| > ```ini | ||||
| > [model] | ||||
| > @architectures = "spacy.TextCatReduce.v1" | ||||
| > exclusive_classes = false | ||||
| > use_reduce_first = false | ||||
| > use_reduce_last = false | ||||
| > use_reduce_max = false | ||||
| > use_reduce_mean = true | ||||
| > nO = null | ||||
| > | ||||
| > [model.tok2vec] | ||||
| > @architectures = "spacy.HashEmbedCNN.v2" | ||||
| > pretrained_vectors = null | ||||
| > width = 96 | ||||
| > depth = 4 | ||||
| > embed_size = 2000 | ||||
| > window_size = 1 | ||||
| > maxout_pieces = 3 | ||||
| > subword_features = true | ||||
| > ``` | ||||
| 
 | ||||
| A classifier that pools token hidden representations of each `Doc` using | ||||
| first, last, max or mean reduction and then applies a classification layer. | ||||
| Reductions are concatenated when multiple reductions are used. | ||||
| 
 | ||||
| <Infobox variant="warning" title="Relation to TextCatCNN" id="TextCatCNN"> | ||||
| 
 | ||||
| `TextCatReduce` is a generalization of the older | ||||
| [`TextCatCNN`](/api/legacy#TextCatCNN_v2) model. `TextCatCNN` always uses a mean | ||||
| reduction, whereas `TextCatReduce` also supports first, last and max | ||||
| reductions. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| | Name                | Description                                                                                                                                                                                    | | ||||
| | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||
| | `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        | | ||||
| | `use_reduce_first`  | Pool by using the hidden representation of the first token of a `Doc`. ~~bool~~                                                                                                                | | ||||
| | `use_reduce_last`   | Pool by using the hidden representation of the last token of a `Doc`. ~~bool~~                                                                                                                 | | ||||
| | `use_reduce_max`    | Pool by taking the maximum values of the hidden representations of a `Doc`. ~~bool~~                                                                                                           | | ||||
| | `use_reduce_mean`   | Pool by taking the mean of all hidden representations of a `Doc`. ~~bool~~                                                                                                                     | | ||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
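|  | ||||
| A usage sketch that concatenates two reductions (assuming | ||||
| `spacy.TextCatReduce.v1` is available; labels are illustrative): | ||||
|  | ||||
| ```python | ||||
| import spacy | ||||
|  | ||||
| nlp = spacy.blank("en") | ||||
| # Mean and max reductions are both enabled, so their pooled | ||||
| # representations are concatenated before the classification layer. | ||||
| config = { | ||||
|     "model": { | ||||
|         "@architectures": "spacy.TextCatReduce.v1", | ||||
|         "exclusive_classes": False, | ||||
|         "use_reduce_first": False, | ||||
|         "use_reduce_last": False, | ||||
|         "use_reduce_max": True, | ||||
|         "use_reduce_mean": True, | ||||
|         "tok2vec": { | ||||
|             "@architectures": "spacy.HashEmbedCNN.v2", | ||||
|             "pretrained_vectors": None, | ||||
|             "width": 96, | ||||
|             "depth": 4, | ||||
|             "embed_size": 2000, | ||||
|             "window_size": 1, | ||||
|             "maxout_pieces": 3, | ||||
|             "subword_features": True, | ||||
|         }, | ||||
|     } | ||||
| } | ||||
| textcat = nlp.add_pipe("textcat_multilabel", config=config) | ||||
| textcat.add_label("SPAM") | ||||
| ``` | ||||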
| 
 | ||||
| ## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"} | ||||
| 
 | ||||
| ### spacy.SpanCategorizer.v1 {id="SpanCategorizer"} | ||||
|  |  | |||
|  | @ -1268,20 +1268,21 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is | |||
| warmed up before any measurements are taken. | ||||
| 
 | ||||
| ```cli | ||||
| $ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup] | ||||
| $ python -m spacy benchmark speed [model] [data_path] [--code] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup] | ||||
| ``` | ||||
| 
 | ||||
| | Name                 | Description                                                                                              | | ||||
| | -------------------- | -------------------------------------------------------------------------------------------------------- | | ||||
| | `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ | | ||||
| | `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | | ||||
| | `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~           | | ||||
| | `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                          | | ||||
| | `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                           | | ||||
| | `--batches`          | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~                         | | ||||
| | `--warmup`, `-w`     | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~               | | ||||
| | `--help`, `-h`       | Show help message and available arguments. ~~bool (flag)~~                                               | | ||||
| | **PRINTS**           | Pipeline speed in words per second with a 95% confidence interval.                                       | | ||||
| | Name                 | Description                                                                                                                                                                          | | ||||
| | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~                                                                             | | ||||
| | `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                             | | ||||
| | `--code`, `-c`       | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | ||||
| | `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~                                                                                       | | ||||
| | `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                                                                                                      | | ||||
| | `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       | | ||||
| | `--batches`          | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~                                                                                                     | | ||||
| | `--warmup`, `-w`     | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~                                                                                           | | ||||
| | `--help`, `-h`       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           | | ||||
| | **PRINTS**           | Pipeline speed in words per second with a 95% confidence interval.                                                                                                                   | | ||||
| 
 | ||||
| ## apply {id="apply", version="3.5", tag="command"} | ||||
| 
 | ||||
|  | @ -1296,6 +1297,9 @@ input formats are: | |||
| 
 | ||||
| When a directory is provided it is traversed recursively to collect all files. | ||||
| 
 | ||||
| When loading a `.spacy` file, any annotations already stored on the `Doc` that | ||||
| are not overwritten by the pipeline will be preserved. If you want to evaluate | ||||
| the pipeline on raw text only, make sure that the `.spacy` file does not | ||||
| contain any annotations. | ||||
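|  | ||||
| For example, a minimal sketch for building an annotation-free `.spacy` file | ||||
| from raw texts (the file name is illustrative): | ||||
|  | ||||
| ```python | ||||
| import spacy | ||||
| from spacy.tokens import DocBin | ||||
|  | ||||
| nlp = spacy.blank("en") | ||||
| # make_doc only tokenizes, so the stored Docs carry no annotations. | ||||
| docs = [nlp.make_doc(text) for text in ["First text.", "Second text."]] | ||||
| DocBin(docs=docs).to_disk("./raw_texts.spacy") | ||||
| ``` | ||||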
| 
 | ||||
| ```bash | ||||
| $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process] | ||||
| ``` | ||||
|  |  | |||
|  | @ -400,6 +400,14 @@ identifiers are grouped by token. Instances of this class are typically assigned | |||
| to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension | ||||
| attribute. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > # Get the last hidden layer output for "is" (token index 1) | ||||
| > doc = nlp("This is a text.") | ||||
| > tensors = doc._.trf_data.last_hidden_layer_state[1] | ||||
| > ``` | ||||
| 
 | ||||
| | Name              | Description                                                                                                                                                                        | | ||||
| | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `all_outputs`     | List of `Ragged` tensors that correspond to the outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ | | ||||
|  |  | |||
|  | @ -20,10 +20,9 @@ An LLM component is implemented through the `LLMWrapper` class. It is accessible | |||
| through a generic `llm` | ||||
| [component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories) | ||||
| as well as through task-specific component factories: `llm_ner`, `llm_spancat`, | ||||
| `llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization` and | ||||
| `llm_entity_linker`. | ||||
| 
 | ||||
| ### LLMWrapper.\_\_init\_\_ {id="init",tag="method"} | ||||
| `llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization`, | ||||
| `llm_entity_linker`, `llm_raw` and `llm_translation`. For these factories, the | ||||
| GPT-3.5 model from OpenAI is used by default, but this can be customized. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -33,13 +32,18 @@ as well as through task-specific component factories: `llm_ner`, `llm_spancat`, | |||
| > llm = nlp.add_pipe("llm", config=config) | ||||
| > | ||||
| > # Construction via add_pipe with a task-specific factory and default GPT3.5 model | ||||
| > llm = nlp.add_pipe("llm-ner") | ||||
| > llm = nlp.add_pipe("llm_ner") | ||||
| > | ||||
| > # Construction via add_pipe with a task-specific factory and custom model | ||||
| > llm = nlp.add_pipe("llm_ner", config={"model": {"@llm_models": "spacy.Dolly.v1", "name": "dolly-v2-12b"}}) | ||||
| > | ||||
| > # Construction from class | ||||
| > from spacy_llm.pipeline import LLMWrapper | ||||
| > llm = LLMWrapper(vocab=nlp.vocab, task=task, model=model, cache=cache, save_io=True) | ||||
| > ``` | ||||
| 
 | ||||
| ### LLMWrapper.\_\_init\_\_ {id="init",tag="method"} | ||||
| 
 | ||||
| Create a new pipeline instance. In your application, you would normally use a | ||||
| shortcut for this and instantiate the component using its string name and | ||||
| [`nlp.add_pipe`](/api/language#add_pipe). | ||||
|  | @ -225,8 +229,8 @@ All tasks are registered in the `llm_tasks` registry. | |||
| dataset across multiple storage units for easier processing and lookups. In | ||||
| `spacy-llm` we use this term (synonymously: "mapping") to describe the splitting | ||||
| up of prompts if they are too long for a model to handle, and "fusing" | ||||
| (synonymously: "reducing") to describe how the model responses for several shards | ||||
| are merged back together into a single document. | ||||
| (synonymously: "reducing") to describe how the model responses for several | ||||
| shards are merged back together into a single document. | ||||
| 
 | ||||
| Prompts are broken up in a manner that _always_ keeps the prompt in the template | ||||
| intact, meaning that the instructions to the LLM will always stay complete. The | ||||
|  | @ -1133,6 +1137,25 @@ supports `.yml`, `.yaml`, `.json` and `.jsonl`. | |||
| path = "textcat_examples.json" | ||||
| ``` | ||||
| 
 | ||||
| If you want to perform few-shot learning with a binary classifier (i.e. a text | ||||
| either should or should not be assigned to a given class), you can provide | ||||
| positive and negative examples with answers of "POS" or "NEG". "POS" means that | ||||
| this example should be assigned the class label defined in the configuration, | ||||
| while "NEG" means it shouldn't. E.g. for spam classification: | ||||
| 
 | ||||
| ```json | ||||
| [ | ||||
|   { | ||||
|     "text": "You won the lottery! Wire a fee of 200$ to be able to withdraw your winnings.", | ||||
|     "answer": "POS" | ||||
|   }, | ||||
|   { | ||||
|     "text": "Your order #123456789 has arrived", | ||||
|     "answer": "NEG" | ||||
|   } | ||||
| ] | ||||
| ``` | ||||
| 
 | ||||
| ### REL {id="rel"} | ||||
| 
 | ||||
| The REL task extracts relations between named entities. | ||||
|  | @ -1484,7 +1507,7 @@ These models all take the same parameters: | |||
| > ```ini | ||||
| > [components.llm.model] | ||||
| > @llm_models = "spacy.Llama2.v1" | ||||
| > name = "llama2-7b-hf" | ||||
| > name = "Llama-2-7b-hf" | ||||
| > ``` | ||||
| 
 | ||||
| Currently, these models are provided as part of the core library: | ||||
|  |  | |||
|  | @ -162,7 +162,10 @@ network has an internal CNN Tok2Vec layer and uses attention. | |||
| 
 | ||||
| Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means | ||||
| that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not | ||||
| yet support that. | ||||
| yet support that. `TextCatCNN` has been replaced by the more general | ||||
| [`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is | ||||
| identical to `TextCatReduce` with `use_reduce_mean=true`, | ||||
| `use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`. | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
|  | @ -194,11 +197,58 @@ architecture is usually less accurate than the ensemble, but runs faster. | |||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| ### spacy.TextCatCNN.v2 {id="TextCatCNN_v2"} | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
| > ```ini | ||||
| > [model] | ||||
| > @architectures = "spacy.TextCatCNN.v2" | ||||
| > exclusive_classes = false | ||||
| > nO = null | ||||
| > | ||||
| > [model.tok2vec] | ||||
| > @architectures = "spacy.HashEmbedCNN.v2" | ||||
| > pretrained_vectors = null | ||||
| > width = 96 | ||||
| > depth = 4 | ||||
| > embed_size = 2000 | ||||
| > window_size = 1 | ||||
| > maxout_pieces = 3 | ||||
| > subword_features = true | ||||
| > ``` | ||||
| 
 | ||||
| A neural network model where token vectors are calculated using a CNN. The | ||||
| vectors are mean pooled and used as features in a feed-forward network. This | ||||
| architecture is usually less accurate than the ensemble, but runs faster. | ||||
| 
 | ||||
| `TextCatCNN` has been replaced by the more general | ||||
| [`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is | ||||
| identical to `TextCatReduce` with `use_reduce_mean=true`, | ||||
| `use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`. | ||||
| 
 | ||||
| | Name                | Description                                                                                                                                                                                    | | ||||
| | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||
| | `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        | | ||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| <Accordion title="spacy.TextCatCNN.v1 definition" spaced> | ||||
| 
 | ||||
| [TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was | ||||
| not yet resizable. Since v2, new labels can be added to this component, even | ||||
| after training. | ||||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
| ### spacy.TextCatBOW.v1 {id="TextCatBOW_v1"} | ||||
| 
 | ||||
| Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means | ||||
| that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not | ||||
| yet support that. | ||||
| yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an | ||||
| erroneous sparse linear layer that only used a small number of the allocated | ||||
| parameters. | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
|  | @ -222,6 +272,33 @@ the others, but may not be as accurate, especially if texts are short. | |||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| ### spacy.TextCatBOW.v2 {id="TextCatBOW_v2"} | ||||
| 
 | ||||
| Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse | ||||
| linear layer that only used a small number of the allocated parameters. | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
| > ```ini | ||||
| > [model] | ||||
| > @architectures = "spacy.TextCatBOW.v2" | ||||
| > exclusive_classes = false | ||||
| > ngram_size = 1 | ||||
| > no_output_layer = false | ||||
| > nO = null | ||||
| > ``` | ||||
| 
 | ||||
| An n-gram "bag-of-words" model. This architecture should run much faster than | ||||
| the others, but may not be as accurate, especially if texts are short. | ||||
| 
 | ||||
| | Name                | Description                                                                                                                                                                                    | | ||||
| | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||
| | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~                                           | | ||||
| | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          | | ||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"} | ||||
| 
 | ||||
| Identical to | ||||
|  |  | |||
|  | @ -89,6 +89,21 @@ architectures and their arguments and hyperparameters. | |||
| | `negative_weight` <Tag variant="new">3.5.1</Tag>    | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~                                                                                                               | | ||||
| | `allow_overlap` <Tag variant="new">3.5.1</Tag>      | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~                                                                                                                                                        | | ||||
| 
 | ||||
| <Infobox variant="warning"> | ||||
| 
 | ||||
| If you set a non-default value for `spans_key`, you'll have to update | ||||
| `[training.score_weights]` as well so that weights are computed properly. E.g. | ||||
| for `spans_key == "myspankey"`, include this in your config: | ||||
| 
 | ||||
| ```ini | ||||
| [training.score_weights] | ||||
| spans_myspankey_f = 1.0 | ||||
| spans_myspankey_p = 0.0 | ||||
| spans_myspankey_r = 0.0 | ||||
| ``` | ||||
| 
 | ||||
| </Infobox> | ||||
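|  | ||||
| A minimal sketch of the component side (the key and label names are | ||||
| illustrative): | ||||
|  | ||||
| ```python | ||||
| import spacy | ||||
|  | ||||
| nlp = spacy.blank("en") | ||||
| # Predicted spans will be stored in doc.spans["myspankey"]. | ||||
| spancat = nlp.add_pipe("spancat", config={"spans_key": "myspankey"}) | ||||
| spancat.add_label("TOPIC") | ||||
| ``` | ||||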
| 
 | ||||
| ```python | ||||
| %%GITHUB_SPACY/spacy/pipeline/spancat.py | ||||
| ``` | ||||
|  |  | |||
|  | @ -397,6 +397,17 @@ are wrapped into the | |||
| by this class. Instances of this class are typically assigned to the | ||||
| [`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > # Get the last hidden layer output for "is" (token index 1) | ||||
| > doc = nlp("This is a text.") | ||||
| > indices = doc._.trf_data.align[1].data.flatten() | ||||
| > last_hidden_state = doc._.trf_data.model_output.last_hidden_state | ||||
| > dim = last_hidden_state.shape[-1] | ||||
| > tensors = last_hidden_state.reshape(-1, dim)[indices] | ||||
| > ``` | ||||
| 
 | ||||
| | Name           | Description                                                                                                                                                                                                                                                                                                                          | | ||||
| | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `tokens`       | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~                       | | ||||
|  |  | |||
|  | @ -13,7 +13,7 @@ between `Doc` objects. | |||
| <Infobox variant ="warning"> | ||||
| 
 | ||||
| Note that a `Vocab` instance is not static. It increases in size as texts with | ||||
| new tokens are processed. | ||||
| new tokens are processed. Some models may have an empty vocab at initialization. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
|  | @ -93,6 +93,7 @@ given string, you need to look it up in | |||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > nlp("I'm eating an apple") | ||||
| > apple = nlp.vocab.strings["apple"] | ||||
| > oov = nlp.vocab.strings["dskfodkfos"] | ||||
| > assert apple in nlp.vocab | ||||
|  |  | |||
|  | @ -108,12 +108,12 @@ In the `sm`/`md`/`lg` models: | |||
| 
 | ||||
| #### CNN/CPU pipelines with floret vectors | ||||
| 
 | ||||
| The Finnish, Korean and Swedish `md` and `lg` pipelines use | ||||
| [floret vectors](/usage/v3-2#vectors) instead of default vectors. If you're | ||||
| running a trained pipeline on texts and working with [`Doc`](/api/doc) objects, | ||||
| you shouldn't notice any difference with floret vectors. With floret vectors no | ||||
| tokens are out-of-vocabulary, so [`Token.is_oov`](/api/token#attributes) will | ||||
| return `False` for all tokens. | ||||
| The Croatian, Finnish, Korean, Slovenian, Swedish and Ukrainian `md` and `lg` | ||||
| pipelines use [floret vectors](/usage/v3-2#vectors) instead of default vectors. | ||||
| If you're running a trained pipeline on texts and working with [`Doc`](/api/doc) | ||||
| objects, you shouldn't notice any difference with floret vectors. With floret | ||||
| vectors no tokens are out-of-vocabulary, so | ||||
| [`Token.is_oov`](/api/token#attributes) will return `False` for all tokens. | ||||
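|  | ||||
| For example (a sketch; assumes the Finnish `md` pipeline is installed, e.g. | ||||
| via `python -m spacy download fi_core_news_md`): | ||||
|  | ||||
| ```python | ||||
| import spacy | ||||
|  | ||||
| nlp = spacy.load("fi_core_news_md") | ||||
| doc = nlp("Tämä on esimerkki.") | ||||
| # With floret vectors, every token gets a vector and none are OOV. | ||||
| assert not any(token.is_oov for token in doc) | ||||
| ``` | ||||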
| 
 | ||||
| If you access vectors directly for similarity comparisons, there are a few | ||||
| differences because floret vectors don't include a fixed word list like the | ||||
|  | @ -132,10 +132,20 @@ vector keys for default vectors. | |||
| 
 | ||||
| ### Transformer pipeline design {id="design-trf"} | ||||
| 
 | ||||
| In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present) | ||||
| all listen to the `transformer` component. The `attribute_ruler` and | ||||
| In the transformer (`trf`) pipelines, the `tagger`, `parser` and `ner` (if | ||||
| present) all listen to the `transformer` component. The `attribute_ruler` and | ||||
| `lemmatizer` have the same configuration as in the CNN models. | ||||
| 
 | ||||
| For spaCy v3.0-v3.6, `trf` pipelines use | ||||
| [`spacy-transformers`](https://github.com/explosion/spacy-transformers) and the | ||||
| transformer output in `doc._.trf_data` is a | ||||
| [`TransformerData`](/api/transformer#transformerdata) object. | ||||
| 
 | ||||
| For spaCy v3.7+, `trf` pipelines use | ||||
| [`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers) | ||||
| and `doc._.trf_data` is a | ||||
| [`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object. | ||||
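|  | ||||
| A quick way to check which object a loaded pipeline produces (a sketch; | ||||
| assumes a `trf` pipeline such as `en_core_web_trf` is installed): | ||||
|  | ||||
| ```python | ||||
| import spacy | ||||
|  | ||||
| nlp = spacy.load("en_core_web_trf") | ||||
| doc = nlp("This is a text.") | ||||
| # TransformerData for pipelines trained with spacy-transformers | ||||
| # (spaCy v3.0-v3.6), DocTransformerOutput for pipelines trained with | ||||
| # spacy-curated-transformers (spaCy v3.7+). | ||||
| print(type(doc._.trf_data).__name__) | ||||
| ``` | ||||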
| 
 | ||||
| ### Modifying the default pipeline {id="design-modify"} | ||||
| 
 | ||||
| For faster processing, you may only want to run a subset of the components in a | ||||
|  |  | |||
|  | @ -31,8 +31,6 @@ for ent in doc.ents: | |||
| Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what | ||||
| our example sentence and its named entities look like: | ||||
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualization of entities" | ||||
|   src="/images/displacy-ent1.html" | ||||
|   height={100} | ||||
| /> | ||||
| <Standalone height={120}> | ||||
| <div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}><mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Apple <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> is looking at buying <mark style={{ background: '#feca74', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>U.K. <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>GPE</span></mark> startup for <mark style={{ background: '#e4e7d2', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>$1 billion <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>MONEY</span></mark></div> | ||||
| </Standalone> | ||||
|  |  | |||
|  | @ -56,8 +56,7 @@ for token in doc: | |||
| Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what | ||||
| our example sentence and its dependencies look like: | ||||
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualization of dependencies and entities" | ||||
|   src="/images/displacy-long.html" | ||||
|   height={450} | ||||
| <ImageScrollable | ||||
|   src="/images/displacy-long.svg" | ||||
|   width={1975} | ||||
| /> | ||||
|  |  | |||
|  | @ -153,8 +153,9 @@ maxout_pieces = 3 | |||
| depth = 2 | ||||
| 
 | ||||
| [components.textcat.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| ``` | ||||
|  | @ -170,8 +171,9 @@ factory = "textcat" | |||
| labels = [] | ||||
| 
 | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| nO = null | ||||
|  |  | |||
|  | @ -290,11 +290,7 @@ for token in doc: | |||
| | toward        | `prep`     | shift     | `NOUN`   | manufacturers           | | ||||
| | manufacturers | `pobj`     | toward    | `ADP`    |                         | | ||||
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualization of dependencies and entities 2" | ||||
|   src="/images/displacy-long2.html" | ||||
|   height={450} | ||||
| /> | ||||
| <ImageScrollable src="/images/displacy-long2.svg" width={1275} /> | ||||
| 
 | ||||
| Because the syntactic relations form a tree, every word has **exactly one | ||||
| head**. You can therefore iterate over the arcs in the tree by iterating over | ||||
|  | @ -709,11 +705,9 @@ doc = nlp(text) | |||
| displacy.serve(doc, style="ent") | ||||
| ``` | ||||
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualizer for entities" | ||||
|   src="/images/displacy-ent2.html" | ||||
|   height={180} | ||||
| /> | ||||
| <Standalone height={180}> | ||||
| <div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>When <mark style={{ background: '#aa9cfc', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Sebastian Thrun <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>PERSON</span></mark> started working on self-driving cars at <mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> in <mark style={{ background: '#bfe1d9', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>2007 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>DATE</span></mark>, few people outside of the company took him seriously.</div> | ||||
| </Standalone> | ||||
| 
 | ||||
| ## Entity Linking {id="entity-linking"} | ||||
| 
 | ||||
|  | @ -723,6 +717,10 @@ identifier from a knowledge base (KB). You can create your own | |||
| [`KnowledgeBase`](/api/kb) and [train](/usage/training) a new | ||||
| [`EntityLinker`](/api/entitylinker) using that custom knowledge base. | ||||
| 
 | ||||
| For an example of how to define a `KnowledgeBase` and train an entity linker | ||||
| model, see [this tutorial](https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson) | ||||
| using [spaCy projects](/usage/projects). | ||||
| 
 | ||||
| ### Accessing entity identifiers {id="entity-linking-accessing",model="entity linking"} | ||||
| 
 | ||||
| The annotated KB identifier is accessible as either a hash value or as a string, | ||||
|  | @ -733,6 +731,7 @@ object, or the `ent_kb_id` and `ent_kb_id_` attributes of a | |||
| ```python | ||||
| import spacy | ||||
| 
 | ||||
| # "my_custom_el_pipeline" is assumed to be a custom NLP pipeline that was trained and serialized to disk | ||||
| nlp = spacy.load("my_custom_el_pipeline") | ||||
| doc = nlp("Ada Lovelace was born in London") | ||||
| 
 | ||||
|  |  | |||
|  | @ -1328,8 +1328,9 @@ labels = [] | |||
| # This function is created and then passed to the "textcat" component as | ||||
| # the argument "model" | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatBOW.v2" | ||||
| @architectures = "spacy.TextCatBOW.v3" | ||||
| exclusive_classes = true | ||||
| length = 262144 | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
|  |  | |||
|  | @ -1144,10 +1144,9 @@ relations and tokens we want to match: | |||
| > displacy.serve(doc) | ||||
| > ``` | ||||
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualization of dependencies" | ||||
|   src="/images/displacy-dep-founded.html" | ||||
|   height={450} | ||||
| <ImageScrollable | ||||
|   src="/images/displacy-dep-founded.svg" | ||||
|   width={925} | ||||
| /> | ||||
| 
 | ||||
| The relations we're interested in are: | ||||
|  |  | |||
|  | @ -405,7 +405,7 @@ available to spaCy, all you need to do is install the package in your | |||
| environment: | ||||
| 
 | ||||
| ```bash | ||||
| $ python setup.py develop | ||||
| $ python -m pip install . | ||||
| ``` | ||||
| 
 | ||||
| spaCy is now able to create the pipeline component `"snek"` – even though you | ||||
|  | @ -586,11 +586,9 @@ After installing the package, the custom colors will be used when visualizing | |||
| text with `displacy`. Whenever the label `SNEK` is assigned, it will be | ||||
| displayed in `#3dff74`. | ||||
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualization of entities" | ||||
|   src="/images/displacy-ent-snek.html" | ||||
|   height={100} | ||||
| /> | ||||
| <Standalone height={100}> | ||||
| <div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>🌱🌿 <mark style={{ background: '#3dff74', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>🐍 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>SNEK</span></mark> ____ 🌳🌲 ____ <mark style={{ background: '#cfc5ff', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>👨🌾 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>HUMAN</span></mark> 🏘️</div> | ||||
| </Standalone> | ||||
| 
 | ||||
| ## Saving, loading and distributing trained pipelines {id="models"} | ||||
| 
 | ||||
|  | @ -675,7 +673,7 @@ $ python -m spacy package ./en_example_pipeline ./packages | |||
| ``` | ||||
| 
 | ||||
| This command will create a pipeline package directory and will run | ||||
| `python setup.py sdist` in that directory to create a binary `.whl` file or | ||||
| `python -m build` in that directory to create a binary `.whl` file or | ||||
| `.tar.gz` archive of your package that can be installed using `pip install`. | ||||
| Installing the binary wheel is usually more efficient. | ||||
| 
 | ||||
|  |  | |||
|  | @ -77,11 +77,9 @@ doc.spans["custom"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")] | |||
| displacy.serve(doc, style="span", options={"spans_key": "custom"}) | ||||
| ``` | ||||
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualizer for overlapping spans" | ||||
|   src="/images/displacy-span.html" | ||||
|   height={180} | ||||
| /> | ||||
| <Standalone height={100}> | ||||
| <div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#7aecec', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#7aecec', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>ORG</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#feca74', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>GPE</span></span></span>.</div> | ||||
| </Standalone> | ||||
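If you want the markup itself rather than a running server (for example, to save the visualization to a file), `displacy.render` returns a string instead; a small sketch, reusing the `doc` and custom key from above:

```python
from spacy import displacy

# render() returns the markup as a string instead of serving it;
# "spans_key" points displaCy at doc.spans["custom"] as above.
html = displacy.render(doc, style="span", options={"spans_key": "custom"})
with open("spans.html", "w", encoding="utf-8") as f:
    f.write(html)
```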
| 
 | ||||
| ## Additional features and improvements | ||||
| 
 | ||||
|  |  | |||
|  | @ -119,11 +119,9 @@ doc = nlp(text) | |||
| displacy.serve(doc, style="ent") | ||||
| ``` | ||||
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualizer for entities" | ||||
|   src="/images/displacy-ent2.html" | ||||
|   height={180} | ||||
| /> | ||||
| <Standalone height={180}> | ||||
| <div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>When <mark style={{ background: '#aa9cfc', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Sebastian Thrun <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>PERSON</span></mark> started working on self-driving cars at <mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> in <mark style={{ background: '#bfe1d9', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>2007 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>DATE</span></mark>, few people outside of the company took him seriously.</div> | ||||
| </Standalone> | ||||
| 
 | ||||
| The entity visualizer lets you customize the following `options`: | ||||
| 
 | ||||
|  | @ -148,11 +146,9 @@ use the `colors` setting to add your own colors for them. | |||
| > displacy.serve(doc, style="ent", options=options) | ||||
| > ``` | ||||
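For reference, a sketch of what the `options` in the aside above might contain. The exact values are assumptions chosen to match the rendering below; any valid CSS background value is passed through unchanged:

```python
# Assumed option values matching the visualization below: a gradient
# background for ORG entities, with display restricted to that label.
colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["ORG"], "colors": colors}
```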
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualizer for entities (custom styling)" | ||||
|   src="/images/displacy-ent-custom.html" | ||||
|   height={225} | ||||
| /> | ||||
| <Standalone height={225}> | ||||
| <div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>But <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> is starting from behind. The company made a late push into hardware, and <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Apple <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark>’s Siri, available on iPhones, and <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Amazon <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark>’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer adoption.</div> | ||||
| </Standalone> | ||||
| 
 | ||||
| The above example uses a little trick: Since the background color values are | ||||
| added as the `background` style attribute, you can use any | ||||
|  | @ -197,11 +193,9 @@ doc.spans["sc"] = [ | |||
| displacy.serve(doc, style="span") | ||||
| ``` | ||||
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualizer for overlapping spans" | ||||
|   src="/images/displacy-span.html" | ||||
|   height={180} | ||||
| /> | ||||
| <Standalone height={100}> | ||||
| <div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#7aecec', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#7aecec', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>ORG</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#feca74', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>GPE</span></span></span>.</div> | ||||
| </Standalone> | ||||
| 
 | ||||
| The span visualizer lets you customize the following `options`: | ||||
| 
 | ||||
|  | @ -223,11 +217,9 @@ specify which one displaCy should use with `spans_key` (`sc` is the default). | |||
| > displacy.serve(doc, style="span", options=options) | ||||
| > ``` | ||||
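A sketch of the setup the aside above assumes: a span group stored under a custom key and selected via `spans_key` (the span indices and label are illustrative, matching the rendering below):

```python
from spacy.tokens import Span

# Illustrative span stored under a custom key; "spans_key" tells
# displaCy to read doc.spans["custom"] instead of the default "sc".
doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]
options = {"spans_key": "custom"}
```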
| 
 | ||||
| <Iframe | ||||
|   title="displaCy visualizer for spans (custom spans_key)" | ||||
|   src="/images/displacy-span-custom.html" | ||||
|   height={225} | ||||
| /> | ||||
| <Standalone height={100}> | ||||
| <div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#ddd', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#ddd', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>BANK</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span>.</div> | ||||
| </Standalone> | ||||
| 
 | ||||
| ## Using displaCy in Jupyter notebooks {id="jupyter"} | ||||
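A minimal sketch of notebook usage, assuming a small English pipeline is installed: in Jupyter, `displacy.render` detects the notebook environment and displays inline HTML, and passing `jupyter=True` forces this behaviour.

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # assumed installed pipeline
doc = nlp("When Sebastian Thrun started working on self-driving cars "
          "at Google in 2007, few people took him seriously.")
# In a notebook this renders the visualization inline instead of
# starting a web server.
displacy.render(doc, style="ent", jupyter=True)
```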
| 
 | ||||
|  |  | |||
|  | @ -103,6 +103,10 @@ | |||
|             "has_examples": true, | ||||
|             "models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"] | ||||
|         }, | ||||
|         { | ||||
|             "code": "fo", | ||||
|             "name": "Faroese" | ||||
|         }, | ||||
|         { | ||||
|             "code": "fr", | ||||
|             "name": "French", | ||||
|  | @ -290,6 +294,12 @@ | |||
|             "example": "Dit is een zin.", | ||||
|             "has_examples": true | ||||
|         }, | ||||
|         { | ||||
|             "code": "nn", | ||||
|             "name": "Norwegian Nynorsk", | ||||
|             "example": "Det er ein meir enn i same periode i fjor.", | ||||
|             "has_examples": true | ||||
|         }, | ||||
|         { | ||||
|             "code": "pl", | ||||
|             "name": "Polish", | ||||
|  |  | |||
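Both language entries added above correspond to registered spaCy languages, so a quick sanity check is to create blank pipelines for them. A sketch, assuming a spaCy version that ships the `fo` and `nn` language data:

```python
import spacy

# Blank pipelines only need the language code to be registered; the
# codes match the entries added to languages.json above.
nlp_fo = spacy.blank("fo")  # Faroese
nlp_nn = spacy.blank("nn")  # Norwegian Nynorsk
print(nlp_fo.lang, nlp_nn.lang)
```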
|  | @ -9,14 +9,9 @@ | |||
|                     { "text": "Models & Languages", "url": "/usage/models" }, | ||||
|                     { "text": "Facts & Figures", "url": "/usage/facts-figures" }, | ||||
|                     { "text": "spaCy 101", "url": "/usage/spacy-101" }, | ||||
|                     { "text": "New in v3.0", "url": "/usage/v3" }, | ||||
|                     { "text": "New in v3.1", "url": "/usage/v3-1" }, | ||||
|                     { "text": "New in v3.2", "url": "/usage/v3-2" }, | ||||
|                     { "text": "New in v3.3", "url": "/usage/v3-3" }, | ||||
|                     { "text": "New in v3.4", "url": "/usage/v3-4" }, | ||||
|                     { "text": "New in v3.5", "url": "/usage/v3-5" }, | ||||
|                     { "text": "New in v3.7", "url": "/usage/v3-7" }, | ||||
|                     { "text": "New in v3.6", "url": "/usage/v3-6" }, | ||||
|                     { "text": "New in v3.7", "url": "/usage/v3-7" } | ||||
|                     { "text": "New in v3.5", "url": "/usage/v3-5" } | ||||
|                 ] | ||||
|             }, | ||||
|             { | ||||
|  |  | |||
|  | @ -66,6 +66,10 @@ | |||
|                 { | ||||
|                     "text": "Stack Overflow", | ||||
|                     "url": "http://stackoverflow.com/questions/tagged/spacy" | ||||
|                 }, | ||||
|                 { | ||||
|                     "text": "Merchandise", | ||||
|                     "url": "https://explosion.ai/merch" | ||||
|                 } | ||||
|             ] | ||||
|         }, | ||||
|  |  | |||
|  | @ -4500,6 +4500,23 @@ | |||
|                 "website": "https://nlp.unibuc.ro/people/snisioi.html" | ||||
|             }, | ||||
|             "category": ["pipeline", "training", "models"] | ||||
|         }, | ||||
|         { | ||||
|             "id": "redfield-spacy-nodes", | ||||
|             "title": "Redfield NLP Nodes for KNIME", | ||||
|             "slogan": "Makes the functionality of the spaCy library available in KNIME Analytics Platform.", | ||||
|             "description": "This extension provides nodes that make the functionality of the spaCy library available in the [KNIME Analytics Platform](https://www.knime.com/).", | ||||
|             "github": "Redfield-AB/Spacy-Nodes", | ||||
|             "url": "https://redfield.ai/spacy-redfield/", | ||||
|             "thumb": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/redfield_logo_100x100.png", | ||||
|             "image": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/screen1.png", | ||||
|             "author": "Redfield AB", | ||||
|             "author_links": { | ||||
|                 "twitter": "Redfield_AB", | ||||
|                 "github": "Redfield-AB", | ||||
|                 "website": "https://redfield.ai" | ||||
|             }, | ||||
|             "category": ["standalone"] | ||||
|         } | ||||
|     ], | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,80 +0,0 @@ | |||
| <div | ||||
|     class="entities" | ||||
|     style=" | ||||
|         line-height: 2.5; | ||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, | ||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; | ||||
|         font-size: 18px; | ||||
|     " | ||||
|     >But | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: linear-gradient(90deg, #aa9cfc, #fc9ce7); | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|         >Google | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >ORG</span | ||||
|         ></mark | ||||
|     >is starting from behind. The company made a late push into hardware, and | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: linear-gradient(90deg, #aa9cfc, #fc9ce7); | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|         >Apple | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >ORG</span | ||||
|         ></mark | ||||
|     >’s Siri, available on iPhones, and | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: linear-gradient(90deg, #aa9cfc, #fc9ce7); | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|         >Amazon | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >ORG</span | ||||
|         ></mark | ||||
|     >’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer | ||||
|     adoption.</div | ||||
| > | ||||
|  | @ -1,59 +0,0 @@ | |||
| <div | ||||
|     class="entities" | ||||
|     style=" | ||||
|         line-height: 2.5; | ||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, | ||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; | ||||
|         font-size: 16px; | ||||
|     " | ||||
| > | ||||
|     🌱🌿 | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: #3dff74; | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|         >🐍 | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >SNEK</span | ||||
|         ></mark | ||||
|     > | ||||
|     ____ 🌳🌲 ____ | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: #cfc5ff; | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|         >👨🌾 | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >HUMAN</span | ||||
|         ></mark | ||||
|     > | ||||
|     🏘️ | ||||
| </div> | ||||
|  | @ -1,84 +0,0 @@ | |||
| <div | ||||
|     class="entities" | ||||
|     style=" | ||||
|         line-height: 2.5; | ||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, | ||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; | ||||
|         font-size: 16px; | ||||
|     " | ||||
| > | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: #7aecec; | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|     > | ||||
|         Apple | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >ORG</span | ||||
|         > | ||||
|     </mark> | ||||
|     is looking at buying | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: #feca74; | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|     > | ||||
|         U.K. | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >GPE</span | ||||
|         > | ||||
|     </mark> | ||||
|     startup for | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: #e4e7d2; | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|     > | ||||
|         $1 billion | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >MONEY</span | ||||
|         > | ||||
|     </mark> | ||||
| </div> | ||||
|  | @ -1,86 +0,0 @@ | |||
| <div | ||||
|     class="entities" | ||||
|     style=" | ||||
|         line-height: 2.5; | ||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, | ||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; | ||||
|         font-size: 18px; | ||||
|     " | ||||
| > | ||||
|     When | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: #aa9cfc; | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|     > | ||||
|         Sebastian Thrun | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >PERSON</span | ||||
|         > | ||||
|     </mark> | ||||
|     started working on self-driving cars at | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: #7aecec; | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|     > | ||||
|         Google | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >ORG</span | ||||
|         > | ||||
|     </mark> | ||||
|     in | ||||
|     <mark | ||||
|         class="entity" | ||||
|         style=" | ||||
|             background: #bfe1d9; | ||||
|             padding: 0.45em 0.6em; | ||||
|             margin: 0 0.25em; | ||||
|             line-height: 1; | ||||
|             border-radius: 0.35em; | ||||
|         " | ||||
|     > | ||||
|         2007 | ||||
|         <span | ||||
|             style=" | ||||
|                 font-size: 0.8em; | ||||
|                 font-weight: bold; | ||||
|                 line-height: 1; | ||||
|                 border-radius: 0.35em; | ||||
|                 text-transform: uppercase; | ||||
|                 vertical-align: middle; | ||||
|                 margin-left: 0.5rem; | ||||
|             " | ||||
|             >DATE</span | ||||
|         > | ||||
|     </mark> | ||||
|     , few people outside of the company took him seriously. | ||||
| </div> | ||||
							
								
								
									
										212
									
								
								website/public/images/displacy-long2.svg
									
									
									
									
									
										Normal file
									
								
							
							
						
						|  | @ -0,0 +1,212 @@ | |||
| <svg | ||||
|     xmlns="http://www.w3.org/2000/svg" | ||||
|     xmlns:xlink="http://www.w3.org/1999/xlink" | ||||
|     id="0" | ||||
|     class="displacy" | ||||
|     width="1275" | ||||
|     height="399.5" | ||||
|     style=" | ||||
|         max-width: none; | ||||
|         height: 399.5px; | ||||
|         color: #000000; | ||||
|         background: #ffffff; | ||||
|         font-family: Arial; | ||||
|     " | ||||
| > | ||||
|     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||
|         <tspan class="displacy-word" fill="currentColor" x="50">Autonomous</tspan> | ||||
|         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">ADJ</tspan> | ||||
|     </text> | ||||
| 
 | ||||
|     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||
|         <tspan class="displacy-word" fill="currentColor" x="225">cars</tspan> | ||||
|         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">NOUN</tspan> | ||||
|     </text> | ||||
| 
 | ||||
|     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||
|         <tspan class="displacy-word" fill="currentColor" x="400">shift</tspan> | ||||
|         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="400">VERB</tspan> | ||||
|     </text> | ||||
| 
 | ||||
|     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||
|         <tspan class="displacy-word" fill="currentColor" x="575">insurance</tspan> | ||||
|         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="575">NOUN</tspan> | ||||
|     </text> | ||||
| 
 | ||||
|     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||
|         <tspan class="displacy-word" fill="currentColor" x="750">liability</tspan> | ||||
|         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="750">NOUN</tspan> | ||||
|     </text> | ||||
| 
 | ||||
|     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||
|         <tspan class="displacy-word" fill="currentColor" x="925">toward</tspan> | ||||
|         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="925">ADP</tspan> | ||||
|     </text> | ||||
| 
 | ||||
|     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||
|         <tspan class="displacy-word" fill="currentColor" x="1100">manufacturers</tspan> | ||||
|         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="1100">NOUN</tspan> | ||||
|     </text> | ||||
| 
 | ||||
|     <g class="displacy-arrow"> | ||||
|         <path | ||||
|             class="displacy-arc" | ||||
|             id="arrow-0-0" | ||||
|             stroke-width="2px" | ||||
|             d="M70,264.5 C70,177.0 215.0,177.0 215.0,264.5" | ||||
|             fill="none" | ||||
|             stroke="currentColor" | ||||
|         ></path> | ||||
|         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||
|             <textpath | ||||
|                 xlink:href="#arrow-0-0" | ||||
|                 class="displacy-label" | ||||
|                 startOffset="50%" | ||||
|                 fill="currentColor" | ||||
|                 text-anchor="middle" | ||||
|             > | ||||
|                 amod | ||||
|             </textpath> | ||||
|         </text> | ||||
|         <path | ||||
|             class="displacy-arrowhead" | ||||
|             d="M70,266.5 L62,254.5 78,254.5" | ||||
|             fill="currentColor" | ||||
|         ></path> | ||||
|     </g> | ||||
| 
 | ||||
|     <g class="displacy-arrow"> | ||||
|         <path | ||||
|             class="displacy-arc" | ||||
|             id="arrow-0-1" | ||||
|             stroke-width="2px" | ||||
|             d="M245,264.5 C245,177.0 390.0,177.0 390.0,264.5" | ||||
|             fill="none" | ||||
|             stroke="currentColor" | ||||
|         ></path> | ||||
|         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||
|             <textpath | ||||
|                 xlink:href="#arrow-0-1" | ||||
|                 class="displacy-label" | ||||
|                 startOffset="50%" | ||||
|                 fill="currentColor" | ||||
|                 text-anchor="middle" | ||||
|             > | ||||
|                 nsubj | ||||
|             </textpath> | ||||
|         </text> | ||||
|         <path | ||||
|             class="displacy-arrowhead" | ||||
|             d="M245,266.5 L237,254.5 253,254.5" | ||||
|             fill="currentColor" | ||||
|         ></path> | ||||
|     </g> | ||||
| 
 | ||||
|     <g class="displacy-arrow"> | ||||
|         <path | ||||
|             class="displacy-arc" | ||||
|             id="arrow-0-2" | ||||
|             stroke-width="2px" | ||||
|             d="M595,264.5 C595,177.0 740.0,177.0 740.0,264.5" | ||||
|             fill="none" | ||||
|             stroke="currentColor" | ||||
|         ></path> | ||||
|         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||
|             <textpath | ||||
|                 xlink:href="#arrow-0-2" | ||||
|                 class="displacy-label" | ||||
|                 startOffset="50%" | ||||
|                 fill="currentColor" | ||||
|                 text-anchor="middle" | ||||
|             > | ||||
|                 compound | ||||
|             </textpath> | ||||
|         </text> | ||||
|         <path | ||||
|             class="displacy-arrowhead" | ||||
|             d="M595,266.5 L587,254.5 603,254.5" | ||||
|             fill="currentColor" | ||||
|         ></path> | ||||
|     </g> | ||||
| 
 | ||||
|     <g class="displacy-arrow"> | ||||
|         <path | ||||
|             class="displacy-arc" | ||||
|             id="arrow-0-3" | ||||
|             stroke-width="2px" | ||||
|             d="M420,264.5 C420,89.5 745.0,89.5 745.0,264.5" | ||||
|             fill="none" | ||||
|             stroke="currentColor" | ||||
|         ></path> | ||||
|         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||
|             <textpath | ||||
|                 xlink:href="#arrow-0-3" | ||||
|                 class="displacy-label" | ||||
|                 startOffset="50%" | ||||
|                 fill="currentColor" | ||||
|                 text-anchor="middle" | ||||
|             > | ||||
|                 dobj | ||||
|             </textpath> | ||||
|         </text> | ||||
|         <path | ||||
|             class="displacy-arrowhead" | ||||
|             d="M745.0,266.5 L753.0,254.5 737.0,254.5" | ||||
|             fill="currentColor" | ||||
|         ></path> | ||||
|     </g> | ||||
| 
 | ||||
|     <g class="displacy-arrow"> | ||||
|         <path | ||||
|             class="displacy-arc" | ||||
|             id="arrow-0-4" | ||||
|             stroke-width="2px" | ||||
|             d="M420,264.5 C420,2.0 925.0,2.0 925.0,264.5" | ||||
|             fill="none" | ||||
|             stroke="currentColor" | ||||
|         ></path> | ||||
|         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||
|             <textpath | ||||
|                 xlink:href="#arrow-0-4" | ||||
|                 class="displacy-label" | ||||
|                 startOffset="50%" | ||||
|                 fill="currentColor" | ||||
|                 text-anchor="middle" | ||||
|             > | ||||
|                 prep | ||||
|             </textpath> | ||||
|         </text> | ||||
|         <path | ||||
|             class="displacy-arrowhead" | ||||
|             d="M925.0,266.5 L933.0,254.5 917.0,254.5" | ||||
|             fill="currentColor" | ||||
|         ></path> | ||||
|     </g> | ||||
| 
 | ||||
|     <g class="displacy-arrow"> | ||||
|         <path | ||||
|             class="displacy-arc" | ||||
|             id="arrow-0-5" | ||||
|             stroke-width="2px" | ||||
|             d="M945,264.5 C945,177.0 1090.0,177.0 1090.0,264.5" | ||||
|             fill="none" | ||||
|             stroke="currentColor" | ||||
|         ></path> | ||||
|         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||
|             <textpath | ||||
|                 xlink:href="#arrow-0-5" | ||||
|                 class="displacy-label" | ||||
|                 startOffset="50%" | ||||
|                 fill="currentColor" | ||||
|                 text-anchor="middle" | ||||
|             > | ||||
|                 pobj | ||||
|             </textpath> | ||||
|         </text> | ||||
|         <path | ||||
|             class="displacy-arrowhead" | ||||
|             d="M1090.0,266.5 L1098.0,254.5 1082.0,254.5" | ||||
|             fill="currentColor" | ||||
|         ></path> | ||||
|     </g> | ||||
| </svg> | ||||
|  | @ -1,84 +0,0 @@ | |||
| <div | ||||
|     class="spans" | ||||
|     style=" | ||||
|         line-height: 2.5; | ||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, | ||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; | ||||
|         font-size: 18px; | ||||
|         direction: ltr; | ||||
|     " | ||||
| > | ||||
|     Welcome to the | ||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> | ||||
|         Bank | ||||
|         <span | ||||
|             style=" | ||||
|                 background: #ddd; | ||||
|                 top: 40px; | ||||
|                 height: 4px; | ||||
|                 left: -1px; | ||||
|                 width: calc(100% + 2px); | ||||
|                 position: absolute; | ||||
|             " | ||||
|         > | ||||
|         </span> | ||||
|         <span | ||||
|             style=" | ||||
|                 background: #ddd; | ||||
|                 top: 40px; | ||||
|                 height: 4px; | ||||
|                 border-top-left-radius: 3px; | ||||
|                 border-bottom-left-radius: 3px; | ||||
|                 left: -1px; | ||||
|                 width: calc(100% + 2px); | ||||
|                 position: absolute; | ||||
|             " | ||||
|         > | ||||
|             <span | ||||
|                 style=" | ||||
|                     background: #ddd; | ||||
|                     color: #000; | ||||
|                     top: -0.5em; | ||||
|                     padding: 2px 3px; | ||||
|                     position: absolute; | ||||
|                     font-size: 0.6em; | ||||
|                     font-weight: bold; | ||||
|                     line-height: 1; | ||||
|                     border-radius: 3px; | ||||
|                 " | ||||
|             > | ||||
|                 BANK | ||||
|             </span> | ||||
|         </span> | ||||
|     </span> | ||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> | ||||
|         of | ||||
|         <span | ||||
|             style=" | ||||
|                 background: #ddd; | ||||
|                 top: 40px; | ||||
|                 height: 4px; | ||||
|                 left: -1px; | ||||
|                 width: calc(100% + 2px); | ||||
|                 position: absolute; | ||||
|             " | ||||
|         > | ||||
|         </span> | ||||
|     </span> | ||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> | ||||
|         China | ||||
| 
 | ||||
|         <span | ||||
|             style=" | ||||
|                 background: #ddd; | ||||
|                 top: 40px; | ||||
|                 height: 4px; | ||||
|                 left: -1px; | ||||
|                 width: calc(100% + 2px); | ||||
|                 position: absolute; | ||||
|             " | ||||
|         > | ||||
|         </span> | ||||
|     </span> | ||||
|     . | ||||
| </div> | ||||
|  | @ -1,123 +0,0 @@ | |||
| <div | ||||
|     class="spans" | ||||
|     style=" | ||||
|         line-height: 2.5; | ||||
|         direction: ltr; | ||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, | ||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; | ||||
|         font-size: 18px; | ||||
|     " | ||||
| > | ||||
|     Welcome to the | ||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> | ||||
|         Bank | ||||
|         <span | ||||
|             style=" | ||||
|                 background: #7aecec; | ||||
|                 top: 40px; | ||||
|                 height: 4px; | ||||
|                 left: -1px; | ||||
|                 width: calc(100% + 2px); | ||||
|                 position: absolute; | ||||
|             " | ||||
|         > | ||||
|         </span> | ||||
|         <span | ||||
|             style=" | ||||
|                 background: #7aecec; | ||||
|                 top: 40px; | ||||
|                 height: 4px; | ||||
|                 border-top-left-radius: 3px; | ||||
|                 border-bottom-left-radius: 3px; | ||||
|                 left: -1px; | ||||
|                 width: calc(100% + 2px); | ||||
|                 position: absolute; | ||||
|             " | ||||
|         > | ||||
|             <span | ||||
|                 style=" | ||||
|                     background: #7aecec; | ||||
|                     color: #000; | ||||
|                     top: -0.5em; | ||||
|                     padding: 2px 3px; | ||||
|                     position: absolute; | ||||
|                     font-size: 0.6em; | ||||
|                     font-weight: bold; | ||||
|                     line-height: 1; | ||||
|                     border-radius: 3px; | ||||
|                 " | ||||
|             > | ||||
|                 ORG | ||||
|             </span> | ||||
|         </span> | ||||
|     </span> | ||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> | ||||
|         of | ||||
| 
 | ||||
|         <span | ||||
|             style=" | ||||
|                 background: #7aecec; | ||||
|                 top: 40px; | ||||
|                 height: 4px; | ||||
|                 left: -1px; | ||||
|                 width: calc(100% + 2px); | ||||
|                 position: absolute; | ||||
|             " | ||||
|         > | ||||
|         </span> | ||||
|     </span> | ||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> | ||||
|         China | ||||
|         <span | ||||
|             style=" | ||||
|                 background: #7aecec; | ||||
|                 top: 40px; | ||||
|                 height: 4px; | ||||
|                 left: -1px; | ||||
|                 width: calc(100% + 2px); | ||||
|                 position: absolute; | ||||
|             " | ||||
|         > | ||||
|         </span> | ||||
|         <span | ||||
|             style=" | ||||
|                 background: #feca74; | ||||
|                 top: 57px; | ||||
|                 height: 4px; | ||||
|                 left: -1px; | ||||
|                 width: calc(100% + 2px); | ||||
|                 position: absolute; | ||||
|             " | ||||
|         > | ||||
|         </span> | ||||
|         <span | ||||
|             style=" | ||||
|                 background: #feca74; | ||||
|                 top: 57px; | ||||
|                 height: 4px; | ||||
|                 border-top-left-radius: 3px; | ||||
|                 border-bottom-left-radius: 3px; | ||||
|                 left: -1px; | ||||
|                 width: calc(100% + 2px); | ||||
|                 position: absolute; | ||||
|             " | ||||
|         > | ||||
|             <span | ||||
|                 style=" | ||||
|                     background: #feca74; | ||||
|                     color: #000; | ||||
|                     top: -0.5em; | ||||
|                     padding: 2px 3px; | ||||
|                     position: absolute; | ||||
|                     font-size: 0.6em; | ||||
|                     font-weight: bold; | ||||
|                     line-height: 1; | ||||
|                     border-radius: 3px; | ||||
|                 " | ||||
|             > | ||||
|                 GPE | ||||
|             </span> | ||||
|         </span> | ||||
|     </span> | ||||
|     . | ||||
| </div> | ||||
|  | @ -107,6 +107,22 @@ const Image = ({ src, alt, title, href, ...props }) => { | |||
|     ) | ||||
| } | ||||
| 
 | ||||
| const ImageScrollable = ({ src, alt, width, ...props }) => { | ||||
|     return ( | ||||
|         <figure className={classNames(classes.standalone, classes.scrollable)}> | ||||
|             <img className={classes['image-scrollable']} src={src} alt={alt} width={width} height="auto" /> | ||||
|         </figure> | ||||
|     ) | ||||
| } | ||||
| 
 | ||||
| const Standalone = ({ height, children, ...props }) => { | ||||
|     return ( | ||||
|         <figure className={classes.standalone} style={{ height }}> | ||||
|             {children} | ||||
|         </figure> | ||||
|     ) | ||||
| } | ||||
| 
 | ||||
| const ImageFill = ({ image, ...props }) => { | ||||
|     return ( | ||||
|         <span | ||||
|  | @ -137,4 +153,4 @@ const GoogleSheet = ({ id, link, height, button = 'View full table' }) => { | |||
|     ) | ||||
| } | ||||
| 
 | ||||
| export { YouTube, SoundCloud, Iframe, Image, ImageFill, GoogleSheet } | ||||
| export { YouTube, SoundCloud, Iframe, Image, ImageFill, ImageScrollable, GoogleSheet, Standalone } | ||||
|  |  | |||
|  | @ -13,7 +13,7 @@ import Aside from './components/aside' | |||
| import Button from './components/button' | ||||
| import Tag from './components/tag' | ||||
| import Grid from './components/grid' | ||||
| import { YouTube, SoundCloud, Iframe, Image, GoogleSheet } from './components/embed' | ||||
| import { YouTube, SoundCloud, Iframe, Image, ImageScrollable, GoogleSheet, Standalone } from './components/embed' | ||||
| import Project from './widgets/project' | ||||
| import { Integration, IntegrationLogo } from './widgets/integration.js' | ||||
| import { Logos, Colors, Patterns } from './widgets/styleguide' | ||||
|  | @ -90,6 +90,8 @@ export const remarkComponents = { | |||
|      * For regular img elements it is not possible to pass properties | ||||
|      */ | ||||
|     Image, | ||||
|     ImageScrollable, | ||||
|     Standalone, | ||||
| 
 | ||||
|     Label, | ||||
|     Logos, | ||||
|  |  | |||