Merge pull request #13286 from explosion/master
Sync `docs/llm_main` with `master`
.github/FUNDING.yml (vendored, new file, +1)

@@ -0,0 +1 @@
+custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions]
.github/workflows/tests.yml (vendored, 4 changes)

@@ -58,7 +58,7 @@ jobs:
       fail-fast: true
       matrix:
         os: [ubuntu-latest, windows-latest, macos-latest]
-        python_version: ["3.11", "3.12.0-rc.2"]
+        python_version: ["3.12"]
         include:
           - os: windows-latest
             python_version: "3.7"
@@ -68,6 +68,8 @@ jobs:
             python_version: "3.9"
           - os: windows-latest
             python_version: "3.10"
+          - os: macos-latest
+            python_version: "3.11"
 
     runs-on: ${{ matrix.os }}
LICENSE (2 changes)

@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
README.md (11 changes)

@@ -39,28 +39,35 @@ open-source software, released under the
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
+| ⏩ **[GPU Processing]** | Use spaCy with CUDA-compatible GPU processing. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
+| 🦙 **[Large Language Models]** | Integrate LLMs into spaCy pipelines. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
 | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
+| 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
| | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)**                 | | | 👕 **[Swag]**                                                                                                                                                                                                             | Support us and our work with unique, custom-designed swag!                                                                                                                                                                                                                                                                                   | | ||||||
| | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a>   | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** | | | <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy’s core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and well'be in touch! **[Learn more →](https://explosion.ai/tailored-solutions)**                 | | ||||||
 
 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
 [usage guides]: https://spacy.io/usage/
 [api reference]: https://spacy.io/api/
+[gpu processing]: https://spacy.io/usage#gpu
 [models]: https://spacy.io/models
+[large language models]: https://spacy.io/usage/large-language-models
 [universe]: https://spacy.io/universe
 [spacy vs code extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
+[blog]: https://explosion.ai
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
+[swag]: https://explosion.ai/merch
 
 ## 💬 Where to ask questions
 
@@ -158,3 +158,45 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+
+SciPy
+-----
+
+* Files: scorer.py
+
+The implementation of trapezoid() is adapted from SciPy, which is distributed
+under the following license:
+
+New BSD License
+
+Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials provided
+   with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
pyproject.toml

@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.1.8,<8.3.0",
+    "thinc>=8.2.2,<8.3.0",
     "numpy>=1.15.0; python_version < '3.9'",
     "numpy>=1.25.0; python_version >= '3.9'",
 ]
requirements.txt

@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.1.8,<8.3.0
+thinc>=8.2.2,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
setup.cfg

@@ -41,7 +41,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.1.8,<8.3.0
+    thinc>=8.2.2,<8.3.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.11,<3.1.0
@@ -49,7 +49,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.1.8,<8.3.0
+    thinc>=8.2.2,<8.3.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
spacy/about.py

@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.1"
+__version__ = "3.7.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/cli/__init__.py

@@ -22,8 +22,17 @@ from .init_pipeline import init_pipeline_cli  # noqa: F401
 from .package import package  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .profile import profile  # noqa: F401
-from .train import train_cli  # noqa: F401
-from .validate import validate  # noqa: F401
+from .project.assets import project_assets  # type: ignore[attr-defined]  # noqa: F401
+from .project.clone import project_clone  # type: ignore[attr-defined]  # noqa: F401
+from .project.document import (  # type: ignore[attr-defined]  # noqa: F401
+    project_document,
+)
+from .project.dvc import project_update_dvc  # type: ignore[attr-defined]  # noqa: F401
+from .project.pull import project_pull  # type: ignore[attr-defined]  # noqa: F401
+from .project.push import project_push  # type: ignore[attr-defined]  # noqa: F401
+from .project.run import project_run  # type: ignore[attr-defined]  # noqa: F401
+from .train import train_cli  # type: ignore[attr-defined]  # noqa: F401
+from .validate import validate  # type: ignore[attr-defined]  # noqa: F401
 
 
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
spacy/cli/benchmark_speed.py

@@ -13,7 +13,7 @@ from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, setup_gpu
+from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu
 
 
 @benchmark_cli.command(
@@ -30,12 +30,14 @@ def benchmark_speed_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
     warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     # fmt: on
 ):
     """
     Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
     data in the binary .spacy format.
     """
+    import_code(code_path)
     setup_gpu(use_gpu=use_gpu, silent=False)
 
     nlp = util.load_model(model)
@@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
 def warmup(
     nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = warmup_epochs * docs
+    docs = [doc.copy() for doc in docs * warmup_epochs]
     return annotate(nlp, docs, batch_size)
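The new `--code` option reuses `import_code`, the same hook the `train` CLI exposes for loading registered functions before the pipeline itself is loaded. A hedged sketch of a file you might pass with `--code` (the component name is illustrative, not part of this diff):

# custom_components.py (illustrative name): registering a component here
# makes it resolvable when the benchmarked pipeline is loaded.
from spacy.language import Language
from spacy.tokens import Doc

@Language.component("my_noop_component")
def my_noop_component(doc: Doc) -> Doc:
    # Pass-through component; a real one would add annotations.
    return doc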
spacy/cli/download.py

@@ -7,7 +7,14 @@ from wasabi import msg
 
 from .. import about
 from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ..util import (
+    get_minor_version,
+    is_in_interactive,
+    is_in_jupyter,
+    is_package,
+    is_prerelease_version,
+    run_command,
+)
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 
@@ -77,6 +84,27 @@ def download(
         "Download and installation successful",
         f"You can now load the package via spacy.load('{model_name}')",
     )
+    if is_in_jupyter():
+        reload_deps_msg = (
+            "If you are in a Jupyter or Colab notebook, you may need to "
+            "restart Python in order to load all the package's dependencies. "
+            "You can do this by selecting the 'Restart kernel' or 'Restart "
+            "runtime' option."
+        )
+        msg.warn(
+            "Restart to reload dependencies",
+            reload_deps_msg,
+        )
+    elif is_in_interactive():
+        reload_deps_msg = (
+            "If you are in an interactive Python session, you may need to "
+            "exit and restart Python to load all the package's dependencies. "
+            "You can exit with Ctrl-D (or Ctrl-Z and Enter on Windows)."
+        )
+        msg.warn(
+            "Restart to reload dependencies",
+            reload_deps_msg,
+        )
 
 
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
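Both branches rely on environment-detection helpers from `spacy.util`. A simplified sketch of how an interactive-session check can work (assumed implementation, not copied from spaCy's source):

import sys

def is_in_interactive() -> bool:
    # A plain interactive REPL defines sys.ps1/ps2; scripts normally do not.
    return hasattr(sys, "ps1") or hasattr(sys, "ps2")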
spacy/cli/package.py

@@ -1,5 +1,7 @@
+import os
 import re
 import shutil
+import subprocess
 import sys
 from collections import defaultdict
 from pathlib import Path
@@ -11,6 +13,7 @@ from thinc.api import Config
 from wasabi import MarkdownRenderer, Printer, get_raw_input
 
 from .. import about, util
+from ..compat import importlib_metadata
 from ..schemas import ModelMetaSchema, validate
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
 
@@ -35,7 +38,7 @@ def package_cli(
     specified output directory, and the data will be copied over. If
     --create-meta is set and a meta.json already exists in the output directory,
     the existing values will be used as the defaults in the command-line prompt.
-    After packaging, "python setup.py sdist" is run in the package directory,
+    After packaging, "python -m build --sdist" is run in the package directory,
     which will create a .tar.gz archive that can be installed via "pip install".
 
     If additional code files are provided (e.g. Python files containing custom
@@ -78,9 +81,17 @@ def package(
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)
-    if create_wheel and not has_wheel():
-        err = "Generating a binary .whl file requires wheel to be installed"
-        msg.fail(err, "pip install wheel", exits=1)
+    if create_wheel and not has_wheel() and not has_build():
+        err = (
+            "Generating wheels requires 'build' or 'wheel' (deprecated) to be installed"
+        )
+        msg.fail(err, "pip install build", exits=1)
+    if not has_build():
+        msg.warn(
+            "Generating packages without the 'build' package is deprecated and "
+            "will not be supported in the future. To install 'build': pip "
+            "install build"
+        )
     if not input_path or not input_path.exists():
         msg.fail("Can't locate pipeline data", input_path, exits=1)
     if not output_path or not output_path.exists():
@@ -184,12 +195,37 @@ def package(
     msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
+            # run directly, since util.run_command is not designed to continue
+            # after a command fails
+            ret = subprocess.run(
+                [sys.executable, "-m", "build", ".", "--sdist"],
+                env=os.environ.copy(),
+            )
+            if ret.returncode != 0:
+                msg.warn(
+                    "Creating sdist with 'python -m build' failed. Falling "
+                    "back to deprecated use of 'python setup.py sdist'"
+                )
+                util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
         zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
         msg.good(f"Successfully created zipped Python package", zip_file)
     if create_wheel:
         with util.working_dir(main_path):
-            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
+            # run directly, since util.run_command is not designed to continue
+            # after a command fails
+            ret = subprocess.run(
+                [sys.executable, "-m", "build", ".", "--wheel"],
+                env=os.environ.copy(),
+            )
+            if ret.returncode != 0:
+                msg.warn(
+                    "Creating wheel with 'python -m build' failed. Falling "
+                    "back to deprecated use of 'wheel' with "
+                    "'python setup.py bdist_wheel'"
+                )
+                util.run_command(
+                    [sys.executable, "setup.py", "bdist_wheel"], capture=False
+                )
         wheel_name_squashed = re.sub("_+", "_", model_name_v)
         wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
         msg.good(f"Successfully created binary wheel", wheel)
@@ -209,6 +245,17 @@ def has_wheel() -> bool:
         return False
 
 
+def has_build() -> bool:
+    # it's very likely that there is a local directory named build/ (especially
+    # in an editable install), so an import check is not sufficient; instead
+    # check that there is a package version
+    try:
+        importlib_metadata.version("build")
+        return True
+    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
+        return False
+
+
 def get_third_party_dependencies(
     config: Config, exclude: List[str] = util.SimpleFrozenList()
 ) -> List[str]:
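The comment in `has_build()` is the key detail: a bare `import build` can succeed by importing a stray local `build/` directory (common in editable installs), so the check queries installed-package metadata instead. A standalone sketch of the same pattern using the standard library (spaCy routes this through its `compat` module for older Pythons):

import importlib.metadata

def is_distribution_installed(name: str) -> bool:
    # Metadata lookup is not fooled by a same-named directory on sys.path.
    try:
        importlib.metadata.version(name)
        return True
    except importlib.metadata.PackageNotFoundError:
        return False

print(is_distribution_installed("build"))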
spacy/cli/project/__init__.py (new file, empty)
spacy/cli/project/assets.py (new file, +1)

@@ -0,0 +1 @@
+from weasel.cli.assets import *
spacy/cli/project/clone.py (new file, +1)

@@ -0,0 +1 @@
+from weasel.cli.clone import *
spacy/cli/project/document.py (new file, +1)

@@ -0,0 +1 @@
+from weasel.cli.document import *
spacy/cli/project/dvc.py (new file, +1)

@@ -0,0 +1 @@
+from weasel.cli.dvc import *
spacy/cli/project/pull.py (new file, +1)

@@ -0,0 +1 @@
+from weasel.cli.pull import *
spacy/cli/project/push.py (new file, +1)

@@ -0,0 +1 @@
+from weasel.cli.push import *
spacy/cli/project/remote_storage.py (new file, +1)

@@ -0,0 +1 @@
+from weasel.cli.remote_storage import *
spacy/cli/project/run.py (new file, +1)

@@ -0,0 +1 @@
+from weasel.cli.run import *
spacy/cli/templates/quickstart_training.jinja

@@ -271,8 +271,9 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false
 
@@ -308,8 +309,9 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 
 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false
 
@@ -542,14 +544,15 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false
 
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
@@ -570,15 +573,17 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 
 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false
 
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false
 {%- endif %}
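`TextCatBOW.v3` makes the size of the hashed ngram table an explicit, validated `length` setting (the new E1056 error added further down rejects values below 1); 262144 is the default the template uses. A hedged sketch of configuring the architecture from Python (assumes spaCy >= 3.7.2):

# Adding a textcat whose model uses the v3 BOW architecture; `length` is the
# number of rows in the hashed ngram table (262144 mirrors the template).
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "textcat",
    config={
        "model": {
            "@architectures": "spacy.TextCatBOW.v3",
            "exclusive_classes": True,
            "length": 262144,
            "ngram_size": 1,
            "no_output_layer": False,
        }
    },
)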
spacy/displacy/render.py

@@ -142,7 +142,25 @@ class SpanRenderer:
         spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
         title (str / None): Document title set in Doc.user_data['title'].
         """
-        per_token_info = []
+        per_token_info = self._assemble_per_token_info(tokens, spans)
+        markup = self._render_markup(per_token_info)
+        markup = TPL_SPANS.format(content=markup, dir=self.direction)
+        if title:
+            markup = TPL_TITLE.format(title=title) + markup
+        return markup
+
+    @staticmethod
+    def _assemble_per_token_info(
+        tokens: List[str], spans: List[Dict[str, Any]]
+    ) -> List[Dict[str, List[Dict[str, Any]]]]:
+        """Assembles token info used to generate markup in render_spans().
+        tokens (List[str]): Tokens in text.
+        spans (List[Dict[str, Any]]): Spans in text.
+        RETURNS (List[Dict[str, List[Dict, str, Any]]]): Per token info needed to render HTML markup for given tokens
+            and spans.
+        """
+        per_token_info: List[Dict[str, List[Dict[str, Any]]]] = []
+
         # we must sort so that we can correctly describe when spans need to "stack"
         # which is determined by their start token, then span length (longer spans on top),
         # then break any remaining ties with the span label
@@ -154,21 +172,22 @@ class SpanRenderer:
                 s["label"],
             ),
         )
+
         for s in spans:
             # this is the vertical 'slot' that the span will be rendered in
             # vertical_position = span_label_offset + (offset_step * (slot - 1))
             s["render_slot"] = 0
+
         for idx, token in enumerate(tokens):
             # Identify if a token belongs to a Span (and which) and if it's a
             # start token of said Span. We'll use this for the final HTML render
             token_markup: Dict[str, Any] = {}
             token_markup["text"] = token
-            concurrent_spans = 0
+            intersecting_spans: List[Dict[str, Any]] = []
             entities = []
             for span in spans:
                 ent = {}
                 if span["start_token"] <= idx < span["end_token"]:
-                    concurrent_spans += 1
                     span_start = idx == span["start_token"]
                     ent["label"] = span["label"]
                     ent["is_start"] = span_start
@@ -176,7 +195,12 @@ class SpanRenderer:
                         # When the span starts, we need to know how many other
                         # spans are on the 'span stack' and will be rendered.
                         # This value becomes the vertical render slot for this entire span
-                        span["render_slot"] = concurrent_spans
+                        span["render_slot"] = (
+                            intersecting_spans[-1]["render_slot"]
+                            if len(intersecting_spans)
+                            else 0
+                        ) + 1
+                    intersecting_spans.append(span)
                     ent["render_slot"] = span["render_slot"]
                     kb_id = span.get("kb_id", "")
                     kb_url = span.get("kb_url", "#")
@@ -193,11 +217,8 @@ class SpanRenderer:
                     span["render_slot"] = 0
             token_markup["entities"] = entities
             per_token_info.append(token_markup)
-        markup = self._render_markup(per_token_info)
-        markup = TPL_SPANS.format(content=markup, dir=self.direction)
-        if title:
-            markup = TPL_TITLE.format(title=title) + markup
-        return markup
+
+        return per_token_info
 
     def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
         """Render the markup from per-token information"""
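The refactor also changes how vertical slots are assigned: a starting span is now stacked one level above the highest `render_slot` among the spans it actually overlaps, rather than being numbered by a running count of concurrent spans. A sketch that exercises this path through displacy's documented manual span format (data adapted from the spaCy docs; rendering details may vary by version):

# Two overlapping spans: "GPE" must stack above the longer "ORG" span.
from spacy import displacy

doc_data = {
    "text": "Welcome to the Bank of China",
    "tokens": ["Welcome", "to", "the", "Bank", "of", "China"],
    "spans": [
        {"start_token": 3, "end_token": 6, "start": 15, "end": 28, "label": "ORG"},
        {"start_token": 5, "end_token": 6, "start": 23, "end": 28, "label": "GPE"},
    ],
}
html = displacy.render(doc_data, style="span", manual=True)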
spacy/errors.py

@@ -227,7 +227,6 @@ class Errors(metaclass=ErrorsWithCodes):
     E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
             "This usually happens when spaCy calls `nlp.{method}` with a custom "
             "component name that's not registered on the current language class. "
-            "If you're using a Transformer, make sure to install 'spacy-transformers'. "
             "If you're using a custom component, make sure you've added the "
             "decorator `@Language.component` (for function components) or "
             "`@Language.factory` (for class components).\n\nAvailable "
@@ -984,6 +983,10 @@ class Errors(metaclass=ErrorsWithCodes):
              "predicted docs when training {component}.")
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
+    E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+             "reduction. Please enable one of `use_reduce_first`, "
+             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
spacy/kb/__init__.py

@@ -1,3 +1,11 @@
 from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
+
+__all__ = [
+    "Candidate",
+    "KnowledgeBase",
+    "InMemoryLookupKB",
+    "get_candidates",
+    "get_candidates_batch",
+]
spacy/lang/en/lex_attrs.py

@@ -6,7 +6,8 @@ _num_words = [
     "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
     "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
     "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
-    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
+    "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion",
+    "septillion", "octillion", "nonillion", "decillion", "gajillion", "bazillion"
 ]
 _ordinal_words = [
     "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
@@ -14,7 +15,8 @@ _ordinal_words = [
     "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
     "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
     "eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth",
-    "trillionth", "quadrillionth", "gajillionth", "bazillionth"
+    "trillionth", "quadrillionth", "quintillionth", "sextillionth", "septillionth",
+    "octillionth", "nonillionth", "decillionth", "gajillionth", "bazillionth"
 ]
 # fmt: on
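These lists feed the English `like_num` lexical attribute, so the newly added scale words now register as number-like. A quick hedged check (assumes a spaCy version that includes this change):

import spacy

nlp = spacy.blank("en")
doc = nlp("nine quintillion")
# Both tokens are expected to be flagged as number-like.
print([token.like_num for token in doc])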
spacy/lang/fo/__init__.py (new file, +18)

@@ -0,0 +1,18 @@
+from ...language import BaseDefaults, Language
+from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+
+
+class FaroeseDefaults(BaseDefaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    infixes = TOKENIZER_INFIXES
+    suffixes = TOKENIZER_SUFFIXES
+    prefixes = TOKENIZER_PREFIXES
+
+
+class Faroese(Language):
+    lang = "fo"
+    Defaults = FaroeseDefaults
+
+
+__all__ = ["Faroese"]
							
								
								
									
90  spacy/lang/fo/tokenizer_exceptions.py  (new file)
						|  | @ -0,0 +1,90 @@ | ||||||
|  | from ...symbols import ORTH | ||||||
|  | from ...util import update_exc | ||||||
|  | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
|  | 
 | ||||||
|  | _exc = {} | ||||||
|  | 
 | ||||||
|  | for orth in [ | ||||||
|  |     "apr.", | ||||||
|  |     "aug.", | ||||||
|  |     "avgr.", | ||||||
|  |     "árg.", | ||||||
|  |     "ávís.", | ||||||
|  |     "beinl.", | ||||||
|  |     "blkv.", | ||||||
|  |     "blaðkv.", | ||||||
|  |     "blm.", | ||||||
|  |     "blaðm.", | ||||||
|  |     "bls.", | ||||||
|  |     "blstj.", | ||||||
|  |     "blaðstj.", | ||||||
|  |     "des.", | ||||||
|  |     "eint.", | ||||||
|  |     "febr.", | ||||||
|  |     "fyrrv.", | ||||||
|  |     "góðk.", | ||||||
|  |     "h.m.", | ||||||
|  |     "innt.", | ||||||
|  |     "jan.", | ||||||
|  |     "kl.", | ||||||
|  |     "m.a.", | ||||||
|  |     "mðr.", | ||||||
|  |     "mió.", | ||||||
|  |     "nr.", | ||||||
|  |     "nto.", | ||||||
|  |     "nov.", | ||||||
|  |     "nút.", | ||||||
|  |     "o.a.", | ||||||
|  |     "o.a.m.", | ||||||
|  |     "o.a.tíl.", | ||||||
|  |     "o.fl.", | ||||||
|  |     "ff.", | ||||||
|  |     "o.m.a.", | ||||||
|  |     "o.o.", | ||||||
|  |     "o.s.fr.", | ||||||
|  |     "o.tíl.", | ||||||
|  |     "o.ø.", | ||||||
|  |     "okt.", | ||||||
|  |     "omf.", | ||||||
|  |     "pst.", | ||||||
|  |     "ritstj.", | ||||||
|  |     "sbr.", | ||||||
|  |     "sms.", | ||||||
|  |     "smst.", | ||||||
|  |     "smb.", | ||||||
|  |     "sb.", | ||||||
|  |     "sbrt.", | ||||||
|  |     "sp.", | ||||||
|  |     "sept.", | ||||||
|  |     "spf.", | ||||||
|  |     "spsk.", | ||||||
|  |     "t.e.", | ||||||
|  |     "t.s.", | ||||||
|  |     "t.s.s.", | ||||||
|  |     "tlf.", | ||||||
|  |     "tel.", | ||||||
|  |     "tsk.", | ||||||
|  |     "t.o.v.", | ||||||
|  |     "t.d.", | ||||||
|  |     "uml.", | ||||||
|  |     "ums.", | ||||||
|  |     "uppl.", | ||||||
|  |     "upprfr.", | ||||||
|  |     "uppr.", | ||||||
|  |     "útg.", | ||||||
|  |     "útl.", | ||||||
|  |     "útr.", | ||||||
|  |     "vanl.", | ||||||
|  |     "v.", | ||||||
|  |     "v.h.", | ||||||
|  |     "v.ø.o.", | ||||||
|  |     "viðm.", | ||||||
|  |     "viðv.", | ||||||
|  |     "vm.", | ||||||
|  |     "v.m.", | ||||||
|  | ]: | ||||||
|  |     _exc[orth] = [{ORTH: orth}] | ||||||
|  |     capitalized = orth.capitalize() | ||||||
|  |     _exc[capitalized] = [{ORTH: capitalized}] | ||||||
|  | 
 | ||||||
|  | TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) | ||||||
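Once the `fo` code registers, these exceptions keep abbreviations like `t.d.` intact through tokenization. A quick sketch (the Faroese sentence is invented for illustration):

```python
import spacy

nlp = spacy.blank("fo")  # resolves to the Faroese class added above
doc = nlp("Hon skrivar bøkur, t.d. skaldsøgur.")
print([t.text for t in doc])
# "t.d." stays a single token instead of splitting on its periods
```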
							
								
								
									
20  spacy/lang/nn/__init__.py  (new file)
						|  | @ -0,0 +1,20 @@ | ||||||
|  | from ...language import BaseDefaults, Language | ||||||
|  | from ..nb import SYNTAX_ITERATORS | ||||||
|  | from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES | ||||||
|  | from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class NorwegianNynorskDefaults(BaseDefaults): | ||||||
|  |     tokenizer_exceptions = TOKENIZER_EXCEPTIONS | ||||||
|  |     prefixes = TOKENIZER_PREFIXES | ||||||
|  |     infixes = TOKENIZER_INFIXES | ||||||
|  |     suffixes = TOKENIZER_SUFFIXES | ||||||
|  |     syntax_iterators = SYNTAX_ITERATORS | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class NorwegianNynorsk(Language): | ||||||
|  |     lang = "nn" | ||||||
|  |     Defaults = NorwegianNynorskDefaults | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | __all__ = ["NorwegianNynorsk"] | ||||||
							
								
								
									
15  spacy/lang/nn/examples.py  (new file)
						|  | @ -0,0 +1,15 @@ | ||||||
|  | """ | ||||||
|  | Example sentences to test spaCy and its language models. | ||||||
|  | 
 | ||||||
|  | >>> from spacy.lang.nn.examples import sentences | ||||||
|  | >>> docs = nlp.pipe(sentences) | ||||||
|  | """ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) | ||||||
|  | sentences = [ | ||||||
|  |     "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.", | ||||||
|  |     "Det er ein meir enn i same periode i fjor.", | ||||||
|  |     "Det har lava ned enorme snømengder i store delar av Europa den siste tida.", | ||||||
|  |     "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.", | ||||||
|  | ] | ||||||
							
								
								
									
74  spacy/lang/nn/punctuation.py  (new file)
						|  | @ -0,0 +1,74 @@ | ||||||
|  | from ..char_classes import ( | ||||||
|  |     ALPHA, | ||||||
|  |     ALPHA_LOWER, | ||||||
|  |     ALPHA_UPPER, | ||||||
|  |     CONCAT_QUOTES, | ||||||
|  |     CURRENCY, | ||||||
|  |     LIST_CURRENCY, | ||||||
|  |     LIST_ELLIPSES, | ||||||
|  |     LIST_ICONS, | ||||||
|  |     LIST_PUNCT, | ||||||
|  |     LIST_QUOTES, | ||||||
|  |     PUNCT, | ||||||
|  |     UNITS, | ||||||
|  | ) | ||||||
|  | from ..punctuation import TOKENIZER_SUFFIXES | ||||||
|  | 
 | ||||||
|  | _quotes = CONCAT_QUOTES.replace("'", "") | ||||||
|  | _list_punct = [x for x in LIST_PUNCT if x != "#"] | ||||||
|  | _list_icons = [x for x in LIST_ICONS if x != "°"] | ||||||
|  | _list_icons = [x.replace("\\u00B0", "") for x in _list_icons] | ||||||
|  | _list_quotes = [x for x in LIST_QUOTES if x != "\\'"] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _prefixes = ( | ||||||
|  |     ["§", "%", "=", "—", "–", r"\+(?![0-9])"] | ||||||
|  |     + _list_punct | ||||||
|  |     + LIST_ELLIPSES | ||||||
|  |     + LIST_QUOTES | ||||||
|  |     + LIST_CURRENCY | ||||||
|  |     + LIST_ICONS | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _infixes = ( | ||||||
|  |     LIST_ELLIPSES | ||||||
|  |     + _list_icons | ||||||
|  |     + [ | ||||||
|  |         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), | ||||||
|  |         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), | ||||||
|  |         r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA), | ||||||
|  |         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), | ||||||
|  |         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), | ||||||
|  |         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), | ||||||
|  |     ] | ||||||
|  | ) | ||||||
|  | 
 | ||||||
|  | _suffixes = ( | ||||||
|  |     LIST_PUNCT | ||||||
|  |     + LIST_ELLIPSES | ||||||
|  |     + _list_quotes | ||||||
|  |     + _list_icons | ||||||
|  |     + ["—", "–"] | ||||||
|  |     + [ | ||||||
|  |         r"(?<=[0-9])\+", | ||||||
|  |         r"(?<=°[FfCcKk])\.", | ||||||
|  |         r"(?<=[0-9])(?:{c})".format(c=CURRENCY), | ||||||
|  |         r"(?<=[0-9])(?:{u})".format(u=UNITS), | ||||||
|  |         r"(?<=[{al}{e}{p}(?:{q})])\.".format( | ||||||
|  |             al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT | ||||||
|  |         ), | ||||||
|  |         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), | ||||||
|  |     ] | ||||||
|  |     + [r"(?<=[^sSxXzZ])'"] | ||||||
|  | ) | ||||||
|  | _suffixes += [ | ||||||
|  |     suffix | ||||||
|  |     for suffix in TOKENIZER_SUFFIXES | ||||||
|  |     if suffix not in ["'s", "'S", "’s", "’S", r"\'"] | ||||||
|  | ] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | TOKENIZER_PREFIXES = _prefixes | ||||||
|  | TOKENIZER_INFIXES = _infixes | ||||||
|  | TOKENIZER_SUFFIXES = _suffixes | ||||||
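The exported tuples are plain regex fragments; spaCy compiles them into the tokenizer's matchers with its public helpers, which also makes them easy to inspect. A small sketch:

```python
from spacy.lang.nn.punctuation import (
    TOKENIZER_INFIXES,
    TOKENIZER_PREFIXES,
    TOKENIZER_SUFFIXES,
)
from spacy.util import (
    compile_infix_regex,
    compile_prefix_regex,
    compile_suffix_regex,
)

prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
infix_re = compile_infix_regex(TOKENIZER_INFIXES)
suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES)

print(bool(prefix_re.search("§12")))  # "§" is listed as a prefix above
print(bool(infix_re.search("raskt--mogleg")))  # "--" between letters is an infix
print(bool(suffix_re.search("1994.")))  # trailing "." comes from the base suffixes
```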
							
								
								
									
228  spacy/lang/nn/tokenizer_exceptions.py  (new file)
						|  | @ -0,0 +1,228 @@ | ||||||
|  | from ...symbols import NORM, ORTH | ||||||
|  | from ...util import update_exc | ||||||
|  | from ..tokenizer_exceptions import BASE_EXCEPTIONS | ||||||
|  | 
 | ||||||
|  | _exc = {} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | for exc_data in [ | ||||||
|  |     {ORTH: "jan.", NORM: "januar"}, | ||||||
|  |     {ORTH: "feb.", NORM: "februar"}, | ||||||
|  |     {ORTH: "mar.", NORM: "mars"}, | ||||||
|  |     {ORTH: "apr.", NORM: "april"}, | ||||||
|  |     {ORTH: "jun.", NORM: "juni"}, | ||||||
|  |     # note: "jul." is in the simple list below without a NORM exception | ||||||
|  |     {ORTH: "aug.", NORM: "august"}, | ||||||
|  |     {ORTH: "sep.", NORM: "september"}, | ||||||
|  |     {ORTH: "okt.", NORM: "oktober"}, | ||||||
|  |     {ORTH: "nov.", NORM: "november"}, | ||||||
|  |     {ORTH: "des.", NORM: "desember"}, | ||||||
|  | ]: | ||||||
|  |     _exc[exc_data[ORTH]] = [exc_data] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | for orth in [ | ||||||
|  |     "Ap.", | ||||||
|  |     "Aq.", | ||||||
|  |     "Ca.", | ||||||
|  |     "Chr.", | ||||||
|  |     "Co.", | ||||||
|  |     "Dr.", | ||||||
|  |     "F.eks.", | ||||||
|  |     "Fr.p.", | ||||||
|  |     "Frp.", | ||||||
|  |     "Grl.", | ||||||
|  |     "Kr.", | ||||||
|  |     "Kr.F.", | ||||||
|  |     "Kr.F.s", | ||||||
|  |     "Mr.", | ||||||
|  |     "Mrs.", | ||||||
|  |     "Pb.", | ||||||
|  |     "Pr.", | ||||||
|  |     "Sp.", | ||||||
|  |     "St.", | ||||||
|  |     "a.m.", | ||||||
|  |     "ad.", | ||||||
|  |     "adm.dir.", | ||||||
|  |     "adr.", | ||||||
|  |     "b.c.", | ||||||
|  |     "bl.a.", | ||||||
|  |     "bla.", | ||||||
|  |     "bm.", | ||||||
|  |     "bnr.", | ||||||
|  |     "bto.", | ||||||
|  |     "c.c.", | ||||||
|  |     "ca.", | ||||||
|  |     "cand.mag.", | ||||||
|  |     "co.", | ||||||
|  |     "d.d.", | ||||||
|  |     "d.m.", | ||||||
|  |     "d.y.", | ||||||
|  |     "dept.", | ||||||
|  |     "dr.", | ||||||
|  |     "dr.med.", | ||||||
|  |     "dr.philos.", | ||||||
|  |     "dr.psychol.", | ||||||
|  |     "dss.", | ||||||
|  |     "dvs.", | ||||||
|  |     "e.Kr.", | ||||||
|  |     "e.l.", | ||||||
|  |     "eg.", | ||||||
|  |     "eig.", | ||||||
|  |     "ekskl.", | ||||||
|  |     "el.", | ||||||
|  |     "et.", | ||||||
|  |     "etc.", | ||||||
|  |     "etg.", | ||||||
|  |     "ev.", | ||||||
|  |     "evt.", | ||||||
|  |     "f.", | ||||||
|  |     "f.Kr.", | ||||||
|  |     "f.eks.", | ||||||
|  |     "f.o.m.", | ||||||
|  |     "fhv.", | ||||||
|  |     "fk.", | ||||||
|  |     "foreg.", | ||||||
|  |     "fork.", | ||||||
|  |     "fv.", | ||||||
|  |     "fvt.", | ||||||
|  |     "g.", | ||||||
|  |     "gl.", | ||||||
|  |     "gno.", | ||||||
|  |     "gnr.", | ||||||
|  |     "grl.", | ||||||
|  |     "gt.", | ||||||
|  |     "h.r.adv.", | ||||||
|  |     "hhv.", | ||||||
|  |     "hoh.", | ||||||
|  |     "hr.", | ||||||
|  |     "ifb.", | ||||||
|  |     "ifm.", | ||||||
|  |     "iht.", | ||||||
|  |     "inkl.", | ||||||
|  |     "istf.", | ||||||
|  |     "jf.", | ||||||
|  |     "jr.", | ||||||
|  |     "jul.", | ||||||
|  |     "juris.", | ||||||
|  |     "kfr.", | ||||||
|  |     "kgl.", | ||||||
|  |     "kgl.res.", | ||||||
|  |     "kl.", | ||||||
|  |     "komm.", | ||||||
|  |     "kr.", | ||||||
|  |     "kst.", | ||||||
|  |     "lat.", | ||||||
|  |     "lø.", | ||||||
|  |     "m.a.", | ||||||
|  |     "m.a.o.", | ||||||
|  |     "m.fl.", | ||||||
|  |     "m.m.", | ||||||
|  |     "m.v.", | ||||||
|  |     "ma.", | ||||||
|  |     "mag.art.", | ||||||
|  |     "md.", | ||||||
|  |     "mfl.", | ||||||
|  |     "mht.", | ||||||
|  |     "mill.", | ||||||
|  |     "min.", | ||||||
|  |     "mnd.", | ||||||
|  |     "moh.", | ||||||
|  |     "mrd.", | ||||||
|  |     "muh.", | ||||||
|  |     "mv.", | ||||||
|  |     "mva.", | ||||||
|  |     "n.å.", | ||||||
|  |     "ndf.", | ||||||
|  |     "nr.", | ||||||
|  |     "nto.", | ||||||
|  |     "nyno.", | ||||||
|  |     "o.a.", | ||||||
|  |     "o.l.", | ||||||
|  |     "obl.", | ||||||
|  |     "off.", | ||||||
|  |     "ofl.", | ||||||
|  |     "on.", | ||||||
|  |     "op.", | ||||||
|  |     "org.", | ||||||
|  |     "osv.", | ||||||
|  |     "ovf.", | ||||||
|  |     "p.", | ||||||
|  |     "p.a.", | ||||||
|  |     "p.g.a.", | ||||||
|  |     "p.m.", | ||||||
|  |     "p.t.", | ||||||
|  |     "pga.", | ||||||
|  |     "ph.d.", | ||||||
|  |     "pkt.", | ||||||
|  |     "pr.", | ||||||
|  |     "pst.", | ||||||
|  |     "pt.", | ||||||
|  |     "red.anm.", | ||||||
|  |     "ref.", | ||||||
|  |     "res.", | ||||||
|  |     "res.kap.", | ||||||
|  |     "resp.", | ||||||
|  |     "rv.", | ||||||
|  |     "s.", | ||||||
|  |     "s.d.", | ||||||
|  |     "s.k.", | ||||||
|  |     "s.u.", | ||||||
|  |     "s.å.", | ||||||
|  |     "sen.", | ||||||
|  |     "sep.", | ||||||
|  |     "siviling.", | ||||||
|  |     "sms.", | ||||||
|  |     "snr.", | ||||||
|  |     "spm.", | ||||||
|  |     "sr.", | ||||||
|  |     "sst.", | ||||||
|  |     "st.", | ||||||
|  |     "st.meld.", | ||||||
|  |     "st.prp.", | ||||||
|  |     "stip.", | ||||||
|  |     "stk.", | ||||||
|  |     "stud.", | ||||||
|  |     "sv.", | ||||||
|  |     "såk.", | ||||||
|  |     "sø.", | ||||||
|  |     "t.d.", | ||||||
|  |     "t.h.", | ||||||
|  |     "t.o.m.", | ||||||
|  |     "t.v.", | ||||||
|  |     "temp.", | ||||||
|  |     "ti.", | ||||||
|  |     "tils.", | ||||||
|  |     "tilsv.", | ||||||
|  |     "tl;dr", | ||||||
|  |     "tlf.", | ||||||
|  |     "to.", | ||||||
|  |     "ult.", | ||||||
|  |     "utg.", | ||||||
|  |     "v.", | ||||||
|  |     "vedk.", | ||||||
|  |     "vedr.", | ||||||
|  |     "vg.", | ||||||
|  |     "vgs.", | ||||||
|  |     "vha.", | ||||||
|  |     "vit.ass.", | ||||||
|  |     "vn.", | ||||||
|  |     "vol.", | ||||||
|  |     "vs.", | ||||||
|  |     "vsa.", | ||||||
|  |     "§§", | ||||||
|  |     "©NTB", | ||||||
|  |     "årg.", | ||||||
|  |     "årh.", | ||||||
|  | ]: | ||||||
|  |     _exc[orth] = [{ORTH: orth}] | ||||||
|  | 
 | ||||||
|  | # Dates | ||||||
|  | for h in range(1, 31 + 1): | ||||||
|  |     for period in ["."]: | ||||||
|  |         _exc[f"{h}{period}"] = [{ORTH: f"{h}."}] | ||||||
|  | 
 | ||||||
|  | _custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]} | ||||||
|  | _exc.update(_custom_base_exc) | ||||||
|  | 
 | ||||||
|  | TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) | ||||||
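Besides the abbreviations, the date loop registers `1.` through `31.` as single tokens, and the custom entry splits `i.` back into `i` plus `.`. A quick sketch of the effect (the sentence is invented):

```python
import spacy

nlp = spacy.blank("nn")  # resolves to the Nynorsk class added above
print([t.text for t in nlp("Møtet er 3. mars, iht. planen.")])
# "3." (date exception) and "iht." (abbreviation) each stay a single token
```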
|  | @ -1683,6 +1683,12 @@ class Language: | ||||||
|         for proc in procs: |         for proc in procs: | ||||||
|             proc.start() |             proc.start() | ||||||
| 
 | 
 | ||||||
|  |         # Close the writing-end of the channels. This is needed so that | ||||||
|  |         # reading from a channel doesn't block indefinitely after the | ||||||
|  |         # worker closes its end. | ||||||
|  |         for tx in bytedocs_send_ch: | ||||||
|  |             tx.close() | ||||||
|  | 
 | ||||||
|         # Cycle channels not to break the order of docs. |         # Cycle channels not to break the order of docs. | ||||||
|         # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable. |         # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable. | ||||||
|         byte_tuples = chain.from_iterable( |         byte_tuples = chain.from_iterable( | ||||||
|  | @ -1705,8 +1711,23 @@ class Language: | ||||||
|                     # tell `sender` that one batch was consumed. |                     # tell `sender` that one batch was consumed. | ||||||
|                     sender.step() |                     sender.step() | ||||||
|         finally: |         finally: | ||||||
|  |             # If we are stopping in an orderly fashion, the workers' queues | ||||||
|  |             # are empty. Put the sentinel in their queues to signal that work | ||||||
|  |             # is done, so that they can exit gracefully. | ||||||
|  |             for q in texts_q: | ||||||
|  |                 q.put(_WORK_DONE_SENTINEL) | ||||||
|  | 
 | ||||||
|  |             # Otherwise, we are stopping because the error handler raised an | ||||||
|  |             # exception. The sentinel will be last to go out of the queue. | ||||||
|  |             # To avoid doing unnecessary work or hanging on platforms that | ||||||
|  |             # block on sending (Windows), we'll close our end of the channel. | ||||||
|  |             # This signals to the worker that it can exit the next time it | ||||||
|  |             # attempts to send data down the channel. | ||||||
|  |             for r in bytedocs_recv_ch: | ||||||
|  |                 r.close() | ||||||
|  | 
 | ||||||
|             for proc in procs: |             for proc in procs: | ||||||
|                 proc.terminate() |                 proc.join() | ||||||
| 
 | 
 | ||||||
|     def _link_components(self) -> None: |     def _link_components(self) -> None: | ||||||
|         """Register 'listeners' within pipeline components, to allow them to |         """Register 'listeners' within pipeline components, to allow them to | ||||||
|  | @ -2323,6 +2344,11 @@ def _apply_pipes( | ||||||
|     while True: |     while True: | ||||||
|         try: |         try: | ||||||
|             texts_with_ctx = receiver.get() |             texts_with_ctx = receiver.get() | ||||||
|  | 
 | ||||||
|  |             # Stop working if we encounter the end-of-work sentinel. | ||||||
|  |             if isinstance(texts_with_ctx, _WorkDoneSentinel): | ||||||
|  |                 return | ||||||
|  | 
 | ||||||
|             docs = ( |             docs = ( | ||||||
|                 ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx |                 ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx | ||||||
|             ) |             ) | ||||||
|  | @ -2331,11 +2357,21 @@ def _apply_pipes( | ||||||
|             # Connection does not accept unpickable objects, so send list. |             # Connection does not accept unpickable objects, so send list. | ||||||
|             byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs] |             byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs] | ||||||
|             padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs)) |             padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs)) | ||||||
|             sender.send(byte_docs + padding)  # type: ignore[operator] |             data: Sequence[Tuple[Optional[bytes], Optional[Any], Optional[bytes]]] = ( | ||||||
|  |                 byte_docs + padding  # type: ignore[operator] | ||||||
|  |             ) | ||||||
|         except Exception: |         except Exception: | ||||||
|             error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))] |             error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))] | ||||||
|             padding = [(None, None, None)] * (len(texts_with_ctx) - 1) |             padding = [(None, None, None)] * (len(texts_with_ctx) - 1) | ||||||
|             sender.send(error_msg + padding) |             data = error_msg + padding | ||||||
|  | 
 | ||||||
|  |         try: | ||||||
|  |             sender.send(data) | ||||||
|  |         except BrokenPipeError: | ||||||
|  |             # Parent has closed the pipe prematurely. This happens when a | ||||||
|  |             # worker encounters an error and the error handler is set to | ||||||
|  |             # stop processing. | ||||||
|  |             return | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class _Sender: | class _Sender: | ||||||
|  | @ -2365,3 +2401,10 @@ class _Sender: | ||||||
|         if self.count >= self.chunk_size: |         if self.count >= self.chunk_size: | ||||||
|             self.count = 0 |             self.count = 0 | ||||||
|             self.send() |             self.send() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class _WorkDoneSentinel: | ||||||
|  |     pass | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | _WORK_DONE_SENTINEL = _WorkDoneSentinel() | ||||||
|  |  | ||||||
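The shutdown protocol above combines two signals: a sentinel on each worker's queue for orderly exits, and closing the parent's pipe ends so a worker blocked in `send()` gets `BrokenPipeError` instead of hanging (sends can block on Windows in particular). A minimal, self-contained sketch of the same pattern, not spaCy's actual classes:

```python
import multiprocessing as mp


class _Done:
    """End-of-work sentinel, standing in for _WorkDoneSentinel."""


def worker(q, tx):
    while True:
        item = q.get()
        if isinstance(item, _Done):
            return  # orderly shutdown: sentinel received
        try:
            tx.send(item * 2)
        except BrokenPipeError:
            return  # error path: parent closed its read end early


if __name__ == "__main__":
    q = mp.Queue()
    rx, tx = mp.Pipe(duplex=False)
    proc = mp.Process(target=worker, args=(q, tx))
    proc.start()
    tx.close()  # close the parent's copy of the write end, as in the PR
    q.put(21)
    print(rx.recv())  # 42
    q.put(_Done())
    proc.join()  # join instead of terminate, mirroring the change above
```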
|  | @ -3,4 +3,4 @@ from .levenshtein import levenshtein | ||||||
| from .matcher import Matcher | from .matcher import Matcher | ||||||
| from .phrasematcher import PhraseMatcher | from .phrasematcher import PhraseMatcher | ||||||
| 
 | 
 | ||||||
| __all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"] | __all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"] | ||||||
|  |  | ||||||
|  | @ -1,21 +1,27 @@ | ||||||
| from functools import partial | from functools import partial | ||||||
| from typing import List, Optional, cast | from typing import List, Optional, Tuple, cast | ||||||
| 
 | 
 | ||||||
| from thinc.api import ( | from thinc.api import ( | ||||||
|     Dropout, |     Dropout, | ||||||
|  |     Gelu, | ||||||
|     LayerNorm, |     LayerNorm, | ||||||
|     Linear, |     Linear, | ||||||
|     Logistic, |     Logistic, | ||||||
|     Maxout, |     Maxout, | ||||||
|     Model, |     Model, | ||||||
|     ParametricAttention, |     ParametricAttention, | ||||||
|  |     ParametricAttention_v2, | ||||||
|     Relu, |     Relu, | ||||||
|     Softmax, |     Softmax, | ||||||
|     SparseLinear, |     SparseLinear, | ||||||
|  |     SparseLinear_v2, | ||||||
|     chain, |     chain, | ||||||
|     clone, |     clone, | ||||||
|     concatenate, |     concatenate, | ||||||
|     list2ragged, |     list2ragged, | ||||||
|  |     reduce_first, | ||||||
|  |     reduce_last, | ||||||
|  |     reduce_max, | ||||||
|     reduce_mean, |     reduce_mean, | ||||||
|     reduce_sum, |     reduce_sum, | ||||||
|     residual, |     residual, | ||||||
|  | @ -25,9 +31,10 @@ from thinc.api import ( | ||||||
| ) | ) | ||||||
| from thinc.layers.chain import init as init_chain | from thinc.layers.chain import init as init_chain | ||||||
| from thinc.layers.resizable import resize_linear_weighted, resize_model | from thinc.layers.resizable import resize_linear_weighted, resize_model | ||||||
| from thinc.types import Floats2d | from thinc.types import ArrayXd, Floats2d | ||||||
| 
 | 
 | ||||||
| from ...attrs import ORTH | from ...attrs import ORTH | ||||||
|  | from ...errors import Errors | ||||||
| from ...tokens import Doc | from ...tokens import Doc | ||||||
| from ...util import registry | from ...util import registry | ||||||
| from ..extract_ngrams import extract_ngrams | from ..extract_ngrams import extract_ngrams | ||||||
|  | @ -47,39 +54,15 @@ def build_simple_cnn_text_classifier( | ||||||
|     outputs sum to 1. If exclusive_classes=False, a logistic non-linearity |     outputs sum to 1. If exclusive_classes=False, a logistic non-linearity | ||||||
|     is applied instead, so that outputs are in the range [0, 1]. |     is applied instead, so that outputs are in the range [0, 1]. | ||||||
|     """ |     """ | ||||||
|     fill_defaults = {"b": 0, "W": 0} |     return build_reduce_text_classifier( | ||||||
|     with Model.define_operators({">>": chain}): |         tok2vec=tok2vec, | ||||||
|         cnn = tok2vec >> list2ragged() >> reduce_mean() |         exclusive_classes=exclusive_classes, | ||||||
|         nI = tok2vec.maybe_get_dim("nO") |         use_reduce_first=False, | ||||||
|         if exclusive_classes: |         use_reduce_last=False, | ||||||
|             output_layer = Softmax(nO=nO, nI=nI) |         use_reduce_max=False, | ||||||
|             fill_defaults["b"] = NEG_VALUE |         use_reduce_mean=True, | ||||||
|             resizable_layer: Model = resizable( |         nO=nO, | ||||||
|                 output_layer, |     ) | ||||||
|                 resize_layer=partial( |  | ||||||
|                     resize_linear_weighted, fill_defaults=fill_defaults |  | ||||||
|                 ), |  | ||||||
|             ) |  | ||||||
|             model = cnn >> resizable_layer |  | ||||||
|         else: |  | ||||||
|             output_layer = Linear(nO=nO, nI=nI) |  | ||||||
|             resizable_layer = resizable( |  | ||||||
|                 output_layer, |  | ||||||
|                 resize_layer=partial( |  | ||||||
|                     resize_linear_weighted, fill_defaults=fill_defaults |  | ||||||
|                 ), |  | ||||||
|             ) |  | ||||||
|             model = cnn >> resizable_layer >> Logistic() |  | ||||||
|         model.set_ref("output_layer", output_layer) |  | ||||||
|         model.attrs["resize_output"] = partial( |  | ||||||
|             resize_and_set_ref, |  | ||||||
|             resizable_layer=resizable_layer, |  | ||||||
|         ) |  | ||||||
|     model.set_ref("tok2vec", tok2vec) |  | ||||||
|     if nO is not None: |  | ||||||
|         model.set_dim("nO", cast(int, nO)) |  | ||||||
|     model.attrs["multi_label"] = not exclusive_classes |  | ||||||
|     return model |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def resize_and_set_ref(model, new_nO, resizable_layer): | def resize_and_set_ref(model, new_nO, resizable_layer): | ||||||
|  | @ -95,10 +78,48 @@ def build_bow_text_classifier( | ||||||
|     ngram_size: int, |     ngram_size: int, | ||||||
|     no_output_layer: bool, |     no_output_layer: bool, | ||||||
|     nO: Optional[int] = None, |     nO: Optional[int] = None, | ||||||
|  | ) -> Model[List[Doc], Floats2d]: | ||||||
|  |     return _build_bow_text_classifier( | ||||||
|  |         exclusive_classes=exclusive_classes, | ||||||
|  |         ngram_size=ngram_size, | ||||||
|  |         no_output_layer=no_output_layer, | ||||||
|  |         nO=nO, | ||||||
|  |         sparse_linear=SparseLinear(nO=nO), | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @registry.architectures("spacy.TextCatBOW.v3") | ||||||
|  | def build_bow_text_classifier_v3( | ||||||
|  |     exclusive_classes: bool, | ||||||
|  |     ngram_size: int, | ||||||
|  |     no_output_layer: bool, | ||||||
|  |     length: int = 262144, | ||||||
|  |     nO: Optional[int] = None, | ||||||
|  | ) -> Model[List[Doc], Floats2d]: | ||||||
|  |     if length < 1: | ||||||
|  |         raise ValueError(Errors.E1056.format(length=length)) | ||||||
|  | 
 | ||||||
|  |     # Find k such that 2**(k-1) < length <= 2**k. | ||||||
|  |     length = 2 ** (length - 1).bit_length() | ||||||
|  | 
 | ||||||
|  |     return _build_bow_text_classifier( | ||||||
|  |         exclusive_classes=exclusive_classes, | ||||||
|  |         ngram_size=ngram_size, | ||||||
|  |         no_output_layer=no_output_layer, | ||||||
|  |         nO=nO, | ||||||
|  |         sparse_linear=SparseLinear_v2(nO=nO, length=length), | ||||||
|  |     ) | ||||||
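The v3 architecture rounds the requested `length` up to the next power of two before handing it to `SparseLinear_v2` (values below 1 are rejected with the new `E1056`). The arithmetic in one line:

```python
# 2 ** (length - 1).bit_length() rounds up to the next power of two;
# exact powers of two map to themselves, and length=1 gives 2**0 = 1.
for length in (1, 2, 3, 1000, 262144):
    print(length, "->", 2 ** (length - 1).bit_length())
# 1 -> 1, 2 -> 2, 3 -> 4, 1000 -> 1024, 262144 -> 262144
```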
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _build_bow_text_classifier( | ||||||
|  |     exclusive_classes: bool, | ||||||
|  |     ngram_size: int, | ||||||
|  |     no_output_layer: bool, | ||||||
|  |     sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd], | ||||||
|  |     nO: Optional[int] = None, | ||||||
| ) -> Model[List[Doc], Floats2d]: | ) -> Model[List[Doc], Floats2d]: | ||||||
|     fill_defaults = {"b": 0, "W": 0} |     fill_defaults = {"b": 0, "W": 0} | ||||||
|     with Model.define_operators({">>": chain}): |     with Model.define_operators({">>": chain}): | ||||||
|         sparse_linear = SparseLinear(nO=nO) |  | ||||||
|         output_layer = None |         output_layer = None | ||||||
|         if not no_output_layer: |         if not no_output_layer: | ||||||
|             fill_defaults["b"] = NEG_VALUE |             fill_defaults["b"] = NEG_VALUE | ||||||
|  | @ -127,6 +148,9 @@ def build_text_classifier_v2( | ||||||
|     linear_model: Model[List[Doc], Floats2d], |     linear_model: Model[List[Doc], Floats2d], | ||||||
|     nO: Optional[int] = None, |     nO: Optional[int] = None, | ||||||
| ) -> Model[List[Doc], Floats2d]: | ) -> Model[List[Doc], Floats2d]: | ||||||
|  |     # TODO: build the model with _build_parametric_attention_with_residual_nonlinear | ||||||
|  |     # in spaCy v4. We don't do this in spaCy v3 to preserve model | ||||||
|  |     # compatibility. | ||||||
|     exclusive_classes = not linear_model.attrs["multi_label"] |     exclusive_classes = not linear_model.attrs["multi_label"] | ||||||
|     with Model.define_operators({">>": chain, "|": concatenate}): |     with Model.define_operators({">>": chain, "|": concatenate}): | ||||||
|         width = tok2vec.maybe_get_dim("nO") |         width = tok2vec.maybe_get_dim("nO") | ||||||
|  | @ -190,3 +214,145 @@ def build_text_classifier_lowdata( | ||||||
|             model = model >> Dropout(dropout) |             model = model >> Dropout(dropout) | ||||||
|         model = model >> Logistic() |         model = model >> Logistic() | ||||||
|     return model |     return model | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @registry.architectures("spacy.TextCatParametricAttention.v1") | ||||||
|  | def build_textcat_parametric_attention_v1( | ||||||
|  |     tok2vec: Model[List[Doc], List[Floats2d]], | ||||||
|  |     exclusive_classes: bool, | ||||||
|  |     nO: Optional[int] = None, | ||||||
|  | ) -> Model[List[Doc], Floats2d]: | ||||||
|  |     width = tok2vec.maybe_get_dim("nO") | ||||||
|  |     parametric_attention = _build_parametric_attention_with_residual_nonlinear( | ||||||
|  |         tok2vec=tok2vec, | ||||||
|  |         nonlinear_layer=Maxout(nI=width, nO=width), | ||||||
|  |         key_transform=Gelu(nI=width, nO=width), | ||||||
|  |     ) | ||||||
|  |     with Model.define_operators({">>": chain}): | ||||||
|  |         if exclusive_classes: | ||||||
|  |             output_layer = Softmax(nO=nO) | ||||||
|  |         else: | ||||||
|  |             output_layer = Linear(nO=nO) >> Logistic() | ||||||
|  |         model = parametric_attention >> output_layer | ||||||
|  |     if model.has_dim("nO") is not False and nO is not None: | ||||||
|  |         model.set_dim("nO", cast(int, nO)) | ||||||
|  |     model.set_ref("output_layer", output_layer) | ||||||
|  |     model.attrs["multi_label"] = not exclusive_classes | ||||||
|  | 
 | ||||||
|  |     return model | ||||||
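A config sketch wiring the new architecture into a `textcat` component; the tok2vec block below uses illustrative values, nothing mandated by this PR:

```ini
[model]
@architectures = "spacy.TextCatParametricAttention.v1"
exclusive_classes = true
nO = null

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 64
depth = 2
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
```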
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _build_parametric_attention_with_residual_nonlinear( | ||||||
|  |     *, | ||||||
|  |     tok2vec: Model[List[Doc], List[Floats2d]], | ||||||
|  |     nonlinear_layer: Model[Floats2d, Floats2d], | ||||||
|  |     key_transform: Optional[Model[Floats2d, Floats2d]] = None, | ||||||
|  | ) -> Model[List[Doc], Floats2d]: | ||||||
|  |     with Model.define_operators({">>": chain, "|": concatenate}): | ||||||
|  |         width = tok2vec.maybe_get_dim("nO") | ||||||
|  |         attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform) | ||||||
|  |         norm_layer = LayerNorm(nI=width) | ||||||
|  |         parametric_attention = ( | ||||||
|  |             tok2vec | ||||||
|  |             >> list2ragged() | ||||||
|  |             >> attention_layer | ||||||
|  |             >> reduce_sum() | ||||||
|  |             >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0)) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         parametric_attention.init = _init_parametric_attention_with_residual_nonlinear | ||||||
|  | 
 | ||||||
|  |         parametric_attention.set_ref("tok2vec", tok2vec) | ||||||
|  |         parametric_attention.set_ref("attention_layer", attention_layer) | ||||||
|  |         parametric_attention.set_ref("nonlinear_layer", nonlinear_layer) | ||||||
|  |         parametric_attention.set_ref("norm_layer", norm_layer) | ||||||
|  | 
 | ||||||
|  |         return parametric_attention | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model: | ||||||
|  |     tok2vec_width = get_tok2vec_width(model) | ||||||
|  |     model.get_ref("attention_layer").set_dim("nO", tok2vec_width) | ||||||
|  |     model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width) | ||||||
|  |     model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width) | ||||||
|  |     model.get_ref("norm_layer").set_dim("nI", tok2vec_width) | ||||||
|  |     model.get_ref("norm_layer").set_dim("nO", tok2vec_width) | ||||||
|  |     init_chain(model, X, Y) | ||||||
|  |     return model | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @registry.architectures("spacy.TextCatReduce.v1") | ||||||
|  | def build_reduce_text_classifier( | ||||||
|  |     tok2vec: Model, | ||||||
|  |     exclusive_classes: bool, | ||||||
|  |     use_reduce_first: bool, | ||||||
|  |     use_reduce_last: bool, | ||||||
|  |     use_reduce_max: bool, | ||||||
|  |     use_reduce_mean: bool, | ||||||
|  |     nO: Optional[int] = None, | ||||||
|  | ) -> Model[List[Doc], Floats2d]: | ||||||
|  |     """Build a model that classifies pooled `Doc` representations. | ||||||
|  | 
 | ||||||
|  |     Pooling is performed using reductions. Reductions are concatenated when | ||||||
|  |     multiple reductions are used. | ||||||
|  | 
 | ||||||
|  |     tok2vec (Model): the tok2vec layer to pool over. | ||||||
|  |     exclusive_classes (bool): Whether or not classes are mutually exclusive. | ||||||
|  |     use_reduce_first (bool): Pool by using the hidden representation of the | ||||||
|  |         first token of a `Doc`. | ||||||
|  |     use_reduce_last (bool): Pool by using the hidden representation of the | ||||||
|  |         last token of a `Doc`. | ||||||
|  |     use_reduce_max (bool): Pool by taking the maximum values of the hidden | ||||||
|  |         representations of a `Doc`. | ||||||
|  |     use_reduce_mean (bool): Pool by taking the mean of all hidden | ||||||
|  |         representations of a `Doc`. | ||||||
|  |     nO (Optional[int]): Number of classes. | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     fill_defaults = {"b": 0, "W": 0} | ||||||
|  |     reductions = [] | ||||||
|  |     if use_reduce_first: | ||||||
|  |         reductions.append(reduce_first()) | ||||||
|  |     if use_reduce_last: | ||||||
|  |         reductions.append(reduce_last()) | ||||||
|  |     if use_reduce_max: | ||||||
|  |         reductions.append(reduce_max()) | ||||||
|  |     if use_reduce_mean: | ||||||
|  |         reductions.append(reduce_mean()) | ||||||
|  | 
 | ||||||
|  |     if not len(reductions): | ||||||
|  |         raise ValueError(Errors.E1057) | ||||||
|  | 
 | ||||||
|  |     with Model.define_operators({">>": chain}): | ||||||
|  |         cnn = tok2vec >> list2ragged() >> concatenate(*reductions) | ||||||
|  |         nO_tok2vec = tok2vec.maybe_get_dim("nO") | ||||||
|  |         nI = nO_tok2vec * len(reductions) if nO_tok2vec is not None else None | ||||||
|  |         if exclusive_classes: | ||||||
|  |             output_layer = Softmax(nO=nO, nI=nI) | ||||||
|  |             fill_defaults["b"] = NEG_VALUE | ||||||
|  |             resizable_layer: Model = resizable( | ||||||
|  |                 output_layer, | ||||||
|  |                 resize_layer=partial( | ||||||
|  |                     resize_linear_weighted, fill_defaults=fill_defaults | ||||||
|  |                 ), | ||||||
|  |             ) | ||||||
|  |             model = cnn >> resizable_layer | ||||||
|  |         else: | ||||||
|  |             output_layer = Linear(nO=nO, nI=nI) | ||||||
|  |             resizable_layer = resizable( | ||||||
|  |                 output_layer, | ||||||
|  |                 resize_layer=partial( | ||||||
|  |                     resize_linear_weighted, fill_defaults=fill_defaults | ||||||
|  |                 ), | ||||||
|  |             ) | ||||||
|  |             model = cnn >> resizable_layer >> Logistic() | ||||||
|  |         model.set_ref("output_layer", output_layer) | ||||||
|  |         model.attrs["resize_output"] = partial( | ||||||
|  |             resize_and_set_ref, | ||||||
|  |             resizable_layer=resizable_layer, | ||||||
|  |         ) | ||||||
|  |     model.set_ref("tok2vec", tok2vec) | ||||||
|  |     if nO is not None: | ||||||
|  |         model.set_dim("nO", cast(int, nO)) | ||||||
|  |     model.attrs["multi_label"] = not exclusive_classes | ||||||
|  |     return model | ||||||
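Because the selected reductions are concatenated, the pooled width is the tok2vec width times the number of reductions (the `nI` computation above): a 64-dim tok2vec with `reduce_max` and `reduce_mean` gives the output layer `nI = 128`. A config sketch with those two reductions enabled (tok2vec values illustrative):

```ini
[model]
@architectures = "spacy.TextCatReduce.v1"
exclusive_classes = false
use_reduce_first = false
use_reduce_last = false
use_reduce_max = true
use_reduce_mean = true
nO = null

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 64
depth = 2
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
```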
|  |  | ||||||
|  | @ -22,6 +22,7 @@ from .trainable_pipe import TrainablePipe | ||||||
| __all__ = [ | __all__ = [ | ||||||
|     "AttributeRuler", |     "AttributeRuler", | ||||||
|     "DependencyParser", |     "DependencyParser", | ||||||
|  |     "EditTreeLemmatizer", | ||||||
|     "EntityLinker", |     "EntityLinker", | ||||||
|     "EntityRecognizer", |     "EntityRecognizer", | ||||||
|     "EntityRuler", |     "EntityRuler", | ||||||
|  |  | ||||||
|  | @ -29,7 +29,7 @@ cdef class StateClass: | ||||||
|         return [self.B(i) for i in range(self.c.buffer_length())] |         return [self.B(i) for i in range(self.c.buffer_length())] | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def token_vector_lenth(self): |     def token_vector_length(self): | ||||||
|         return self.doc.tensor.shape[1] |         return self.doc.tensor.shape[1] | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|  |  | ||||||
|  | @ -36,8 +36,9 @@ maxout_pieces = 3 | ||||||
| depth = 2 | depth = 2 | ||||||
| 
 | 
 | ||||||
| [model.linear_model] | [model.linear_model] | ||||||
| @architectures = "spacy.TextCatBOW.v2" | @architectures = "spacy.TextCatBOW.v3" | ||||||
| exclusive_classes = true | exclusive_classes = true | ||||||
|  | length = 262144 | ||||||
| ngram_size = 1 | ngram_size = 1 | ||||||
| no_output_layer = false | no_output_layer = false | ||||||
| """ | """ | ||||||
|  | @ -45,16 +46,21 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m | ||||||
| 
 | 
 | ||||||
| single_label_bow_config = """ | single_label_bow_config = """ | ||||||
| [model] | [model] | ||||||
| @architectures = "spacy.TextCatBOW.v2" | @architectures = "spacy.TextCatBOW.v3" | ||||||
| exclusive_classes = true | exclusive_classes = true | ||||||
|  | length = 262144 | ||||||
| ngram_size = 1 | ngram_size = 1 | ||||||
| no_output_layer = false | no_output_layer = false | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| single_label_cnn_config = """ | single_label_cnn_config = """ | ||||||
| [model] | [model] | ||||||
| @architectures = "spacy.TextCatCNN.v2" | @architectures = "spacy.TextCatReduce.v1" | ||||||
| exclusive_classes = true | exclusive_classes = true | ||||||
|  | use_reduce_first = false | ||||||
|  | use_reduce_last = false | ||||||
|  | use_reduce_max = false | ||||||
|  | use_reduce_mean = true | ||||||
| 
 | 
 | ||||||
| [model.tok2vec] | [model.tok2vec] | ||||||
| @architectures = "spacy.HashEmbedCNN.v2" | @architectures = "spacy.HashEmbedCNN.v2" | ||||||
|  |  | ||||||
|  | @ -35,8 +35,9 @@ maxout_pieces = 3 | ||||||
| depth = 2 | depth = 2 | ||||||
| 
 | 
 | ||||||
| [model.linear_model] | [model.linear_model] | ||||||
| @architectures = "spacy.TextCatBOW.v2" | @architectures = "spacy.TextCatBOW.v3" | ||||||
| exclusive_classes = false | exclusive_classes = false | ||||||
|  | length = 262144 | ||||||
| ngram_size = 1 | ngram_size = 1 | ||||||
| no_output_layer = false | no_output_layer = false | ||||||
| """ | """ | ||||||
|  | @ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod | ||||||
| 
 | 
 | ||||||
| multi_label_bow_config = """ | multi_label_bow_config = """ | ||||||
| [model] | [model] | ||||||
| @architectures = "spacy.TextCatBOW.v2" | @architectures = "spacy.TextCatBOW.v3" | ||||||
| exclusive_classes = false | exclusive_classes = false | ||||||
| ngram_size = 1 | ngram_size = 1 | ||||||
| no_output_layer = false | no_output_layer = false | ||||||
|  | @ -52,8 +53,12 @@ no_output_layer = false | ||||||
| 
 | 
 | ||||||
| multi_label_cnn_config = """ | multi_label_cnn_config = """ | ||||||
| [model] | [model] | ||||||
| @architectures = "spacy.TextCatCNN.v2" | @architectures = "spacy.TextCatReduce.v1" | ||||||
| exclusive_classes = false | exclusive_classes = false | ||||||
|  | use_reduce_first = false | ||||||
|  | use_reduce_last = false | ||||||
|  | use_reduce_max = false | ||||||
|  | use_reduce_mean = true | ||||||
| 
 | 
 | ||||||
| [model.tok2vec] | [model.tok2vec] | ||||||
| @architectures = "spacy.HashEmbedCNN.v2" | @architectures = "spacy.HashEmbedCNN.v2" | ||||||
|  |  | ||||||
							
								
								
									
138  spacy/scorer.py
						|  | @ -802,6 +802,140 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | # The following implementation of trapezoid() is adapted from SciPy, | ||||||
|  | # which is distributed under the New BSD License. | ||||||
|  | # Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers. | ||||||
|  | # See licenses/3rd_party_licenses.txt | ||||||
|  | def trapezoid(y, x=None, dx=1.0, axis=-1): | ||||||
|  |     r""" | ||||||
|  |     Integrate along the given axis using the composite trapezoidal rule. | ||||||
|  | 
 | ||||||
|  |     If `x` is provided, the integration happens in sequence along its | ||||||
|  |     elements - they are not sorted. | ||||||
|  | 
 | ||||||
|  |     Integrate `y` (`x`) along each 1d slice on the given axis, compute | ||||||
|  |     :math:`\int y(x) dx`. | ||||||
|  |     When `x` is specified, this integrates along the parametric curve, | ||||||
|  |     computing :math:`\int_t y(t) dt = | ||||||
|  |     \int_t y(t) \left.\frac{dx}{dt}\right|_{x=x(t)} dt`. | ||||||
|  | 
 | ||||||
|  |     Parameters | ||||||
|  |     ---------- | ||||||
|  |     y : array_like | ||||||
|  |         Input array to integrate. | ||||||
|  |     x : array_like, optional | ||||||
|  |         The sample points corresponding to the `y` values. If `x` is None, | ||||||
|  |         the sample points are assumed to be evenly spaced `dx` apart. The | ||||||
|  |         default is None. | ||||||
|  |     dx : scalar, optional | ||||||
|  |         The spacing between sample points when `x` is None. The default is 1. | ||||||
|  |     axis : int, optional | ||||||
|  |         The axis along which to integrate. | ||||||
|  | 
 | ||||||
|  |     Returns | ||||||
|  |     ------- | ||||||
|  |     trapezoid : float or ndarray | ||||||
|  |         Definite integral of `y` = n-dimensional array as approximated along | ||||||
|  |         a single axis by the trapezoidal rule. If `y` is a 1-dimensional array, | ||||||
|  |         then the result is a float. If `n` is greater than 1, then the result | ||||||
|  |         is an `n`-1 dimensional array. | ||||||
|  | 
 | ||||||
|  |     See Also | ||||||
|  |     -------- | ||||||
|  |     cumulative_trapezoid, simpson, romb | ||||||
|  | 
 | ||||||
|  |     Notes | ||||||
|  |     ----- | ||||||
|  |     Image [2]_ illustrates trapezoidal rule -- y-axis locations of points | ||||||
|  |     will be taken from `y` array, by default x-axis distances between | ||||||
|  |     points will be 1.0, alternatively they can be provided with `x` array | ||||||
|  |     or with `dx` scalar.  Return value will be equal to combined area under | ||||||
|  |     the red lines. | ||||||
|  | 
 | ||||||
|  |     References | ||||||
|  |     ---------- | ||||||
|  |     .. [1] Wikipedia page: https://en.wikipedia.org/wiki/Trapezoidal_rule | ||||||
|  | 
 | ||||||
|  |     .. [2] Illustration image: | ||||||
|  |            https://en.wikipedia.org/wiki/File:Composite_trapezoidal_rule_illustration.png | ||||||
|  | 
 | ||||||
|  |     Examples | ||||||
|  |     -------- | ||||||
|  |     Use the trapezoidal rule on evenly spaced points: | ||||||
|  | 
 | ||||||
|  |     >>> import numpy as np | ||||||
|  |     >>> from scipy import integrate | ||||||
|  |     >>> integrate.trapezoid([1, 2, 3]) | ||||||
|  |     4.0 | ||||||
|  | 
 | ||||||
|  |     The spacing between sample points can be selected by either the | ||||||
|  |     ``x`` or ``dx`` arguments: | ||||||
|  | 
 | ||||||
|  |     >>> integrate.trapezoid([1, 2, 3], x=[4, 6, 8]) | ||||||
|  |     8.0 | ||||||
|  |     >>> integrate.trapezoid([1, 2, 3], dx=2) | ||||||
|  |     8.0 | ||||||
|  | 
 | ||||||
|  |     Using a decreasing ``x`` corresponds to integrating in reverse: | ||||||
|  | 
 | ||||||
|  |     >>> integrate.trapezoid([1, 2, 3], x=[8, 6, 4]) | ||||||
|  |     -8.0 | ||||||
|  | 
 | ||||||
|  |     More generally ``x`` is used to integrate along a parametric curve. We can | ||||||
|  |     estimate the integral :math:`\int_0^1 x^2 = 1/3` using: | ||||||
|  | 
 | ||||||
|  |     >>> x = np.linspace(0, 1, num=50) | ||||||
|  |     >>> y = x**2 | ||||||
|  |     >>> integrate.trapezoid(y, x) | ||||||
|  |     0.33340274885464394 | ||||||
|  | 
 | ||||||
|  |     Or estimate the area of a circle, noting we repeat the sample which closes | ||||||
|  |     the curve: | ||||||
|  | 
 | ||||||
|  |     >>> theta = np.linspace(0, 2 * np.pi, num=1000, endpoint=True) | ||||||
|  |     >>> integrate.trapezoid(np.cos(theta), x=np.sin(theta)) | ||||||
|  |     3.141571941375841 | ||||||
|  | 
 | ||||||
|  |     ``trapezoid`` can be applied along a specified axis to do multiple | ||||||
|  |     computations in one call: | ||||||
|  | 
 | ||||||
|  |     >>> a = np.arange(6).reshape(2, 3) | ||||||
|  |     >>> a | ||||||
|  |     array([[0, 1, 2], | ||||||
|  |            [3, 4, 5]]) | ||||||
|  |     >>> integrate.trapezoid(a, axis=0) | ||||||
|  |     array([1.5, 2.5, 3.5]) | ||||||
|  |     >>> integrate.trapezoid(a, axis=1) | ||||||
|  |     array([2.,  8.]) | ||||||
|  |     """ | ||||||
|  |     y = np.asanyarray(y) | ||||||
|  |     if x is None: | ||||||
|  |         d = dx | ||||||
|  |     else: | ||||||
|  |         x = np.asanyarray(x) | ||||||
|  |         if x.ndim == 1: | ||||||
|  |             d = np.diff(x) | ||||||
|  |             # reshape to correct shape | ||||||
|  |             shape = [1] * y.ndim | ||||||
|  |             shape[axis] = d.shape[0] | ||||||
|  |             d = d.reshape(shape) | ||||||
|  |         else: | ||||||
|  |             d = np.diff(x, axis=axis) | ||||||
|  |     nd = y.ndim | ||||||
|  |     slice1 = [slice(None)] * nd | ||||||
|  |     slice2 = [slice(None)] * nd | ||||||
|  |     slice1[axis] = slice(1, None) | ||||||
|  |     slice2[axis] = slice(None, -1) | ||||||
|  |     try: | ||||||
|  |         ret = (d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0).sum(axis) | ||||||
|  |     except ValueError: | ||||||
|  |         # Operations didn't work, cast to ndarray | ||||||
|  |         d = np.asarray(d) | ||||||
|  |         y = np.asarray(y) | ||||||
|  |         ret = np.add.reduce(d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0, axis) | ||||||
|  |     return ret | ||||||
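Vendoring this is presumably motivated by NumPy's deprecation of `np.trapz` (renamed `np.trapezoid` in NumPy 2.0), which `_auc` below previously relied on. A quick sanity check against the docstring's examples:

```python
from spacy.scorer import trapezoid

assert float(trapezoid([1, 2, 3])) == 4.0
assert float(trapezoid([1, 2, 3], x=[4, 6, 8])) == 8.0
assert float(trapezoid([1, 2, 3], dx=2)) == 8.0
```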
|  | 
 | ||||||
|  | 
 | ||||||
| # The following implementation of roc_auc_score() is adapted from | # The following implementation of roc_auc_score() is adapted from | ||||||
| # scikit-learn, which is distributed under the New BSD License. | # scikit-learn, which is distributed under the New BSD License. | ||||||
| # Copyright (c) 2007–2019 The scikit-learn developers. | # Copyright (c) 2007–2019 The scikit-learn developers. | ||||||
|  | @ -1024,9 +1158,9 @@ def _auc(x, y): | ||||||
|         else: |         else: | ||||||
|             raise ValueError(Errors.E164.format(x=x)) |             raise ValueError(Errors.E164.format(x=x)) | ||||||
| 
 | 
 | ||||||
|     area = direction * np.trapz(y, x) |     area = direction * trapezoid(y, x) | ||||||
|     if isinstance(area, np.memmap): |     if isinstance(area, np.memmap): | ||||||
|         # Reductions such as .sum used internally in np.trapz do not return a |         # Reductions such as .sum used internally in trapezoid do not return a | ||||||
|         # scalar by default for numpy.memmap instances contrary to |         # scalar by default for numpy.memmap instances contrary to | ||||||
|         # regular numpy.ndarray instances. |         # regular numpy.ndarray instances. | ||||||
|         area = area.dtype.type(area) |         area = area.dtype.type(area) | ||||||
|  |  | ||||||
|  | @ -162,6 +162,11 @@ def fi_tokenizer(): | ||||||
|     return get_lang_class("fi")().tokenizer |     return get_lang_class("fi")().tokenizer | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.fixture(scope="session") | ||||||
|  | def fo_tokenizer(): | ||||||
|  |     return get_lang_class("fo")().tokenizer | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| @pytest.fixture(scope="session") | @pytest.fixture(scope="session") | ||||||
| def fr_tokenizer(): | def fr_tokenizer(): | ||||||
|     return get_lang_class("fr")().tokenizer |     return get_lang_class("fr")().tokenizer | ||||||
|  | @ -317,6 +322,11 @@ def nl_tokenizer(): | ||||||
|     return get_lang_class("nl")().tokenizer |     return get_lang_class("nl")().tokenizer | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.fixture(scope="session") | ||||||
|  | def nn_tokenizer(): | ||||||
|  |     return get_lang_class("nn")().tokenizer | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| @pytest.fixture(scope="session") | @pytest.fixture(scope="session") | ||||||
| def pl_tokenizer(): | def pl_tokenizer(): | ||||||
|     return get_lang_class("pl")().tokenizer |     return get_lang_class("pl")().tokenizer | ||||||
|  |  | ||||||
|  | @ -731,3 +731,12 @@ def test_for_no_ent_sents(): | ||||||
|     sents = list(doc.ents[0].sents) |     sents = list(doc.ents[0].sents) | ||||||
|     assert len(sents) == 1 |     assert len(sents) == 1 | ||||||
|     assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY" |     assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY" | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_span_api_richcmp_other(en_tokenizer): | ||||||
|  |     doc1 = en_tokenizer("a b") | ||||||
|  |     doc2 = en_tokenizer("b c") | ||||||
|  |     assert not doc1[1:2] == doc1[1] | ||||||
|  |     assert not doc1[1:2] == doc2[0] | ||||||
|  |     assert not doc1[1:2] == doc2[0:1] | ||||||
|  |     assert not doc1[0:1] == doc2 | ||||||
|  |  | ||||||
|  | @ -294,3 +294,12 @@ def test_missing_head_dep(en_vocab): | ||||||
|     assert aligned_heads[0] == ref_heads[0] |     assert aligned_heads[0] == ref_heads[0] | ||||||
|     assert aligned_deps[5] == ref_deps[5] |     assert aligned_deps[5] == ref_deps[5] | ||||||
|     assert aligned_heads[5] == ref_heads[5] |     assert aligned_heads[5] == ref_heads[5] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_token_api_richcmp_other(en_tokenizer): | ||||||
|  |     doc1 = en_tokenizer("a b") | ||||||
|  |     doc2 = en_tokenizer("b c") | ||||||
|  |     assert not doc1[1] == doc1[0:1] | ||||||
|  |     assert not doc1[1] == doc2[1:2] | ||||||
|  |     assert not doc1[1] == doc2[0] | ||||||
|  |     assert not doc1[0] == doc2 | ||||||
|  |  | ||||||
							
								
								
									
0  spacy/tests/lang/fo/__init__.py  (new empty file)
							
								
								
									
26  spacy/tests/lang/fo/test_tokenizer.py  (new file)
						|  | @ -0,0 +1,26 @@ | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | # examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed with CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/) | ||||||
|  | # fmt: off | ||||||
|  | FO_TOKEN_EXCEPTION_TESTS = [ | ||||||
|  |     ( | ||||||
|  |         "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ", | ||||||
|  |         [ | ||||||
|  |             "Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", ".", | ||||||
|  |         ], | ||||||
|  |     ), | ||||||
|  |     ( | ||||||
|  |         "Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.", | ||||||
|  |         [ | ||||||
|  |             "Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", ".", | ||||||
|  |         ], | ||||||
|  |     ), | ||||||
|  | ] | ||||||
|  | # fmt: on | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS) | ||||||
|  | def test_fo_tokenizer_handles_exception_cases(fo_tokenizer, text, expected_tokens): | ||||||
|  |     tokens = fo_tokenizer(text) | ||||||
|  |     token_list = [token.text for token in tokens if not token.is_space] | ||||||
|  |     assert expected_tokens == token_list | ||||||
							
								
								
									
0  spacy/tests/lang/nn/__init__.py  (new empty file)
							
								
								
									
38  spacy/tests/lang/nn/test_tokenizer.py  (new file)
						|  | @ -0,0 +1,38 @@ | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | # examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) | ||||||
|  | # fmt: off | ||||||
|  | NN_TOKEN_EXCEPTION_TESTS = [ | ||||||
|  |     ( | ||||||
|  |         "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.", | ||||||
|  |         [ | ||||||
|  |             "Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", ".", | ||||||
|  |         ], | ||||||
|  |     ), | ||||||
|  |     ( | ||||||
|  |         "Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.", | ||||||
|  |         [ | ||||||
|  |             "Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", ".", | ||||||
|  |         ], | ||||||
|  |     ), | ||||||
|  |     ( | ||||||
|  |         "Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.", | ||||||
|  |         [ | ||||||
|  |             "Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", ".", | ||||||
|  |         ], | ||||||
|  |     ), | ||||||
|  |     ( | ||||||
|  |         "Brukssesongen er frå nov. til mai, med ein topp i mars.", | ||||||
|  |         [ | ||||||
|  |             "Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", ".", | ||||||
|  |         ], | ||||||
|  |     ), | ||||||
|  | ] | ||||||
|  | # fmt: on | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS) | ||||||
|  | def test_nn_tokenizer_handles_exception_cases(nn_tokenizer, text, expected_tokens): | ||||||
|  |     tokens = nn_tokenizer(text) | ||||||
|  |     token_list = [token.text for token in tokens if not token.is_space] | ||||||
|  |     assert expected_tokens == token_list | ||||||
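Below is a minimal usage sketch (not part of the diff) that exercises the Nynorsk tokenizer covered by the tests above; it assumes the `nn` language added in this changeset is available in the installed spaCy.

```python
import spacy

# Quick check: "nov." should survive as a single token thanks to the
# tokenizer exception rules exercised by the test data above.
nlp = spacy.blank("nn")
doc = nlp("Brukssesongen er frå nov. til mai, med ein topp i mars.")
assert "nov." in [t.text for t in doc]
```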
|  | @ -203,7 +203,7 @@ def test_pipe_class_component_model(): | ||||||
|             "@architectures": "spacy.TextCatEnsemble.v2", |             "@architectures": "spacy.TextCatEnsemble.v2", | ||||||
|             "tok2vec": DEFAULT_TOK2VEC_MODEL, |             "tok2vec": DEFAULT_TOK2VEC_MODEL, | ||||||
|             "linear_model": { |             "linear_model": { | ||||||
|                 "@architectures": "spacy.TextCatBOW.v2", |                 "@architectures": "spacy.TextCatBOW.v3", | ||||||
|                 "exclusive_classes": False, |                 "exclusive_classes": False, | ||||||
|                 "ngram_size": 1, |                 "ngram_size": 1, | ||||||
|                 "no_output_layer": False, |                 "no_output_layer": False, | ||||||
|  |  | ||||||
|  | @ -414,7 +414,7 @@ def test_implicit_label(name, get_examples): | ||||||
| @pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||||
|     "name,textcat_config", |     "name,textcat_config", | ||||||
|     [ |     [ | ||||||
|         # BOW |         # BOW V1 | ||||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), |         ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), |         ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), |         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||||
|  | @ -451,14 +451,14 @@ def test_no_resize(name, textcat_config): | ||||||
| @pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||||
|     "name,textcat_config", |     "name,textcat_config", | ||||||
|     [ |     [ | ||||||
|         # BOW |         # BOW V3 | ||||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), |         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), |         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), |         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), |         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), | ||||||
|         # CNN |         # REDUCE | ||||||
|         ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), |         ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), |         ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| # fmt: on | # fmt: on | ||||||
|  | @ -480,14 +480,14 @@ def test_resize(name, textcat_config): | ||||||
| @pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||||
|     "name,textcat_config", |     "name,textcat_config", | ||||||
|     [ |     [ | ||||||
|         # BOW |         # BOW V3 | ||||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), |         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), | ||||||
|         ("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), |         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), | ||||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), |         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), | ||||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), |         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), | ||||||
|         # CNN |         # REDUCE | ||||||
|         ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), |         ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||||
|         ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), |         ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| # fmt: on | # fmt: on | ||||||
|  | @ -693,12 +693,23 @@ def test_overfitting_IO_multi(): | ||||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), |         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), | ||||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), |         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), | ||||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), |         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), | ||||||
|  |         # BOW V3 | ||||||
|  |         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}), | ||||||
|  |         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), | ||||||
|  |         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), | ||||||
|  |         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), | ||||||
|         # ENSEMBLE V2 |         # ENSEMBLE V2 | ||||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), |         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), | ||||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), |         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), | ||||||
|         # CNN V2 |         # CNN V2 (legacy) | ||||||
|         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), |         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), | ||||||
|         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), |         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), | ||||||
|  |         # PARAMETRIC ATTENTION V1 | ||||||
|  |         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), | ||||||
|  |         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), | ||||||
|  |         # REDUCE V1 | ||||||
|  |         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||||
|  |         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| # fmt: on | # fmt: on | ||||||
|  |  | ||||||
|  | @ -15,7 +15,12 @@ def doc_w_attrs(en_tokenizer): | ||||||
|     Token.set_extension("_test_token", default="t0") |     Token.set_extension("_test_token", default="t0") | ||||||
|     doc[1]._._test_token = "t1" |     doc[1]._._test_token = "t1" | ||||||
| 
 | 
 | ||||||
|     return doc |     yield doc | ||||||
|  | 
 | ||||||
|  |     Doc.remove_extension("_test_attr") | ||||||
|  |     Doc.remove_extension("_test_prop") | ||||||
|  |     Doc.remove_extension("_test_method") | ||||||
|  |     Token.remove_extension("_test_token") | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_serialize_ext_attrs_from_bytes(doc_w_attrs): | def test_serialize_ext_attrs_from_bytes(doc_w_attrs): | ||||||
|  |  | ||||||
|  | @ -1061,3 +1061,8 @@ def test_debug_data_trainable_lemmatizer_not_annotated(): | ||||||
| 
 | 
 | ||||||
|     data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) |     data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) | ||||||
|     assert data["no_lemma_annotations"] == 2 |     assert data["no_lemma_annotations"] == 2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_project_api_imports(): | ||||||
|  |     from spacy.cli import project_run | ||||||
|  |     from spacy.cli.project.run import project_run  # noqa: F401, F811 | ||||||
|  |  | ||||||
|  | @ -214,9 +214,6 @@ def test_project_clone(options): | ||||||
|         assert (out / "README.md").is_file() |         assert (out / "README.md").is_file() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.skipif( |  | ||||||
|     sys.version_info >= (3, 12), reason="Python 3.12+ not supported for remotes" |  | ||||||
| ) |  | ||||||
| def test_project_push_pull(project_dir): | def test_project_push_pull(project_dir): | ||||||
|     proj = dict(SAMPLE_PROJECT) |     proj = dict(SAMPLE_PROJECT) | ||||||
|     remote = "xyz" |     remote = "xyz" | ||||||
|  | @ -241,7 +238,7 @@ def test_project_push_pull(project_dir): | ||||||
| 
 | 
 | ||||||
| def test_find_function_valid(): | def test_find_function_valid(): | ||||||
|     # example of architecture in main code base |     # example of architecture in main code base | ||||||
|     function = "spacy.TextCatBOW.v2" |     function = "spacy.TextCatBOW.v3" | ||||||
|     result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"]) |     result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"]) | ||||||
|     assert f"Found registered function '{function}'" in result.stdout |     assert f"Found registered function '{function}'" in result.stdout | ||||||
|     assert "textcat.py" in result.stdout |     assert "textcat.py" in result.stdout | ||||||
|  | @ -260,7 +257,7 @@ def test_find_function_valid(): | ||||||
| 
 | 
 | ||||||
| def test_find_function_invalid(): | def test_find_function_invalid(): | ||||||
|     # invalid registry |     # invalid registry | ||||||
|     function = "spacy.TextCatBOW.v2" |     function = "spacy.TextCatBOW.v3" | ||||||
|     registry = "foobar" |     registry = "foobar" | ||||||
|     result = CliRunner().invoke( |     result = CliRunner().invoke( | ||||||
|         app, ["find-function", function, "--registry", registry] |         app, ["find-function", function, "--registry", registry] | ||||||
|  |  | ||||||
|  | @ -2,7 +2,7 @@ import numpy | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| from spacy import displacy | from spacy import displacy | ||||||
| from spacy.displacy.render import DependencyRenderer, EntityRenderer | from spacy.displacy.render import DependencyRenderer, EntityRenderer, SpanRenderer | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
| from spacy.lang.fa import Persian | from spacy.lang.fa import Persian | ||||||
| from spacy.tokens import Doc, Span | from spacy.tokens import Doc, Span | ||||||
|  | @ -468,3 +468,23 @@ def test_issue12816(en_vocab) -> None: | ||||||
|     # Verify that the HTML tag is still escaped |     # Verify that the HTML tag is still escaped | ||||||
|     html = displacy.render(doc, style="span") |     html = displacy.render(doc, style="span") | ||||||
|     assert "<TEST>" in html |     assert "<TEST>" in html | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.issue(13056) | ||||||
|  | def test_displacy_span_stacking(): | ||||||
|  |     """Test whether span stacking works properly for multiple overlapping spans.""" | ||||||
|  |     spans = [ | ||||||
|  |         {"start_token": 2, "end_token": 5, "label": "SkillNC"}, | ||||||
|  |         {"start_token": 0, "end_token": 2, "label": "Skill"}, | ||||||
|  |         {"start_token": 1, "end_token": 3, "label": "Skill"}, | ||||||
|  |     ] | ||||||
|  |     tokens = ["Welcome", "to", "the", "Bank", "of", "China", "."] | ||||||
|  |     per_token_info = SpanRenderer._assemble_per_token_info(spans=spans, tokens=tokens) | ||||||
|  | 
 | ||||||
|  |     assert len(per_token_info) == len(tokens) | ||||||
|  |     assert all([len(per_token_info[i]["entities"]) == 1 for i in (0, 3, 4)]) | ||||||
|  |     assert all([len(per_token_info[i]["entities"]) == 2 for i in (1, 2)]) | ||||||
|  |     assert per_token_info[1]["entities"][0]["render_slot"] == 1 | ||||||
|  |     assert per_token_info[1]["entities"][1]["render_slot"] == 2 | ||||||
|  |     assert per_token_info[2]["entities"][0]["render_slot"] == 2 | ||||||
|  |     assert per_token_info[2]["entities"][1]["render_slot"] == 3 | ||||||
|  |  | ||||||
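For context, here is a hedged end-to-end sketch of the span stacking being tested: it renders the same overlapping spans with displaCy's span style, where the stacking logic assigns each label band its `render_slot`. The `"sc"` span key is displaCy's default, not something the test itself sets.

```python
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China .")
# The same overlapping spans as in the test, attached under displaCy's
# default span key "sc"
doc.spans["sc"] = [
    Span(doc, 2, 5, label="SkillNC"),
    Span(doc, 0, 2, label="Skill"),
    Span(doc, 1, 3, label="Skill"),
]
html = displacy.render(doc, style="span")
assert "SkillNC" in html
```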
|  | @ -376,8 +376,9 @@ def test_util_dot_section(): | ||||||
|     factory = "textcat" |     factory = "textcat" | ||||||
| 
 | 
 | ||||||
|     [components.textcat.model] |     [components.textcat.model] | ||||||
|     @architectures = "spacy.TextCatBOW.v2" |     @architectures = "spacy.TextCatBOW.v3" | ||||||
|     exclusive_classes = true |     exclusive_classes = true | ||||||
|  |     length = 262144 | ||||||
|     ngram_size = 1 |     ngram_size = 1 | ||||||
|     no_output_layer = false |     no_output_layer = false | ||||||
|     """ |     """ | ||||||
|  | @ -485,8 +486,8 @@ def test_to_ternary_int(): | ||||||
| 
 | 
 | ||||||
| def test_find_available_port(): | def test_find_available_port(): | ||||||
|     host = "0.0.0.0" |     host = "0.0.0.0" | ||||||
|     port = 5000 |     port = 5001 | ||||||
|     assert find_available_port(port, host) == port, "Port 5000 isn't free" |     assert find_available_port(port, host) == port, "Port 5001 isn't free" | ||||||
| 
 | 
 | ||||||
|     from wsgiref.simple_server import demo_app, make_server |     from wsgiref.simple_server import demo_app, make_server | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -26,6 +26,7 @@ from spacy.ml.models import ( | ||||||
|     build_Tok2Vec_model, |     build_Tok2Vec_model, | ||||||
| ) | ) | ||||||
| from spacy.ml.staticvectors import StaticVectors | from spacy.ml.staticvectors import StaticVectors | ||||||
|  | from spacy.util import registry | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_textcat_bow_kwargs(): | def get_textcat_bow_kwargs(): | ||||||
|  | @ -284,3 +285,17 @@ def test_spancat_model_forward_backward(nO=5): | ||||||
|     Y, backprop = model((docs, spans), is_train=True) |     Y, backprop = model((docs, spans), is_train=True) | ||||||
|     assert Y.shape == (spans.dataXd.shape[0], nO) |     assert Y.shape == (spans.dataXd.shape[0], nO) | ||||||
|     backprop(Y) |     backprop(Y) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_textcat_reduce_invalid_args(): | ||||||
|  |     textcat_reduce = registry.architectures.get("spacy.TextCatReduce.v1") | ||||||
|  |     tok2vec = make_test_tok2vec() | ||||||
|  |     with pytest.raises(ValueError, match=r"must be used with at least one reduction"): | ||||||
|  |         textcat_reduce( | ||||||
|  |             tok2vec=tok2vec, | ||||||
|  |             exclusive_classes=False, | ||||||
|  |             use_reduce_first=False, | ||||||
|  |             use_reduce_last=False, | ||||||
|  |             use_reduce_max=False, | ||||||
|  |             use_reduce_mean=False, | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  | @ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab): | ||||||
|     assert tokens == explain_tokens |     assert tokens == explain_tokens | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_tokenizer_explain_special_matcher_whitespace(en_vocab): | ||||||
|  |     rules = {":]": [{"ORTH": ":]"}]} | ||||||
|  |     tokenizer = Tokenizer( | ||||||
|  |         en_vocab, | ||||||
|  |         rules=rules, | ||||||
|  |     ) | ||||||
|  |     text = ": ]" | ||||||
|  |     tokens = [t.text for t in tokenizer(text)] | ||||||
|  |     explain_tokens = [t[1] for t in tokenizer.explain(text)] | ||||||
|  |     assert tokens == explain_tokens | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| @hypothesis.strategies.composite | @hypothesis.strategies.composite | ||||||
| def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str: | def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str: | ||||||
|     """ |     """ | ||||||
|  | @ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None: | ||||||
|     """ |     """ | ||||||
| 
 | 
 | ||||||
|     tokenizer: Tokenizer = spacy.blank(lang).tokenizer |     tokenizer: Tokenizer = spacy.blank(lang).tokenizer | ||||||
|     tokens = [t.text for t in tokenizer(sentence) if not t.is_space] |     # Tokenizer.explain is not intended to handle whitespace or control | ||||||
|  |     # characters in the same way as Tokenizer | ||||||
|  |     sentence = re.sub(r"\s+", " ", sentence).strip() | ||||||
|  |     tokens = [t.text for t in tokenizer(sentence)] | ||||||
|     debug_tokens = [t[1] for t in tokenizer.explain(sentence)] |     debug_tokens = [t[1] for t in tokenizer.explain(sentence)] | ||||||
|     assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}" |     assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}" | ||||||
|  |  | ||||||
|  | @ -730,9 +730,16 @@ cdef class Tokenizer: | ||||||
|             if i in spans_by_start: |             if i in spans_by_start: | ||||||
|                 span = spans_by_start[i] |                 span = spans_by_start[i] | ||||||
|                 exc = [d[ORTH] for d in special_cases[span.label_]] |                 exc = [d[ORTH] for d in special_cases[span.label_]] | ||||||
|                 for j, orth in enumerate(exc): |                 # The phrase matcher can overmatch for tokens separated by | ||||||
|                     final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth])) |                 # spaces in the text but not in the underlying rule, so skip | ||||||
|                 i += len(span) |                 # cases where the texts aren't identical | ||||||
|  |                 if span.text != "".join([self.vocab.strings[orth] for orth in exc]): | ||||||
|  |                     final_tokens.append(tokens[i]) | ||||||
|  |                     i += 1 | ||||||
|  |                 else: | ||||||
|  |                     for j, orth in enumerate(exc): | ||||||
|  |                         final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth])) | ||||||
|  |                     i += len(span) | ||||||
|             else: |             else: | ||||||
|                 final_tokens.append(tokens[i]) |                 final_tokens.append(tokens[i]) | ||||||
|                 i += 1 |                 i += 1 | ||||||
|  |  | ||||||
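A short sketch of the behavior this guard preserves, mirroring the whitespace regression test earlier in the diff: `explain()` stays aligned with the tokenizer even when the phrase matcher overmatches a special case such as `:]` across whitespace.

```python
from spacy.lang.en import English

nlp = English()
nlp.tokenizer.add_special_case(":]", [{"ORTH": ":]"}])
text = ": ]"
# With the overmatch guard, explain() falls back to the plain tokens
# instead of emitting SPECIAL-1 for the non-identical span text
tokens = [t.text for t in nlp.tokenizer(text)]
explained = [t[1] for t in nlp.tokenizer.explain(text)]
assert tokens == explained
```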
|  | @ -5,4 +5,4 @@ from .span import Span | ||||||
| from .span_group import SpanGroup | from .span_group import SpanGroup | ||||||
| from .token import Token | from .token import Token | ||||||
| 
 | 
 | ||||||
| __all__ = ["Doc", "Token", "Span", "SpanGroup", "DocBin", "MorphAnalysis"] | __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"] | ||||||
|  |  | ||||||
|  | @ -42,7 +42,7 @@ class Doc: | ||||||
|     user_hooks: Dict[str, Callable[..., Any]] |     user_hooks: Dict[str, Callable[..., Any]] | ||||||
|     user_token_hooks: Dict[str, Callable[..., Any]] |     user_token_hooks: Dict[str, Callable[..., Any]] | ||||||
|     user_span_hooks: Dict[str, Callable[..., Any]] |     user_span_hooks: Dict[str, Callable[..., Any]] | ||||||
|     tensor: np.ndarray[Any, np.dtype[np.float_]] |     tensor: np.ndarray[Any, np.dtype[np.float64]] | ||||||
|     user_data: Dict[str, Any] |     user_data: Dict[str, Any] | ||||||
|     has_unknown_spaces: bool |     has_unknown_spaces: bool | ||||||
|     _context: Any |     _context: Any | ||||||
|  | @ -125,7 +125,7 @@ class Doc: | ||||||
|         vector: Optional[Floats1d] = ..., |         vector: Optional[Floats1d] = ..., | ||||||
|         alignment_mode: str = ..., |         alignment_mode: str = ..., | ||||||
|         span_id: Union[int, str] = ..., |         span_id: Union[int, str] = ..., | ||||||
|     ) -> Span: ... |     ) -> Optional[Span]: ... | ||||||
|     def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... |     def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... | ||||||
|     @property |     @property | ||||||
|     def has_vector(self) -> bool: ... |     def has_vector(self) -> bool: ... | ||||||
|  | @ -166,7 +166,7 @@ class Doc: | ||||||
|     ) -> Doc: ... |     ) -> Doc: ... | ||||||
|     def to_array( |     def to_array( | ||||||
|         self, py_attr_ids: Union[int, str, List[Union[int, str]]] |         self, py_attr_ids: Union[int, str, List[Union[int, str]]] | ||||||
|     ) -> np.ndarray[Any, np.dtype[np.float_]]: ... |     ) -> np.ndarray[Any, np.dtype[np.float64]]: ... | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def from_docs( |     def from_docs( | ||||||
|         docs: List[Doc], |         docs: List[Doc], | ||||||
|  | @ -179,15 +179,13 @@ class Doc: | ||||||
|         self, path: Union[str, Path], *, exclude: Iterable[str] = ... |         self, path: Union[str, Path], *, exclude: Iterable[str] = ... | ||||||
|     ) -> None: ... |     ) -> None: ... | ||||||
|     def from_disk( |     def from_disk( | ||||||
|         self, path: Union[str, Path], *, exclude: Union[List[str], Tuple[str]] = ... |         self, path: Union[str, Path], *, exclude: Iterable[str] = ... | ||||||
|     ) -> Doc: ... |     ) -> Doc: ... | ||||||
|     def to_bytes(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ... |     def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ... | ||||||
|     def from_bytes( |     def from_bytes(self, bytes_data: bytes, *, exclude: Iterable[str] = ...) -> Doc: ... | ||||||
|         self, bytes_data: bytes, *, exclude: Union[List[str], Tuple[str]] = ... |     def to_dict(self, *, exclude: Iterable[str] = ...) -> Dict[str, Any]: ... | ||||||
|     ) -> Doc: ... |  | ||||||
|     def to_dict(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ... |  | ||||||
|     def from_dict( |     def from_dict( | ||||||
|         self, msg: bytes, *, exclude: Union[List[str], Tuple[str]] = ... |         self, msg: Dict[str, Any], *, exclude: Iterable[str] = ... | ||||||
|     ) -> Doc: ... |     ) -> Doc: ... | ||||||
|     def extend_tensor(self, tensor: Floats2d) -> None: ... |     def extend_tensor(self, tensor: Floats2d) -> None: ... | ||||||
|     def retokenize(self) -> Retokenizer: ... |     def retokenize(self) -> Retokenizer: ... | ||||||
|  |  | ||||||
|  | @ -1326,7 +1326,7 @@ cdef class Doc: | ||||||
| 
 | 
 | ||||||
|         path (str / Path): A path to a directory. Paths may be either |         path (str / Path): A path to a directory. Paths may be either | ||||||
|             strings or `Path`-like objects. |             strings or `Path`-like objects. | ||||||
|         exclude (list): String names of serialization fields to exclude. |         exclude (Iterable[str]): String names of serialization fields to exclude. | ||||||
|         RETURNS (Doc): The modified `Doc` object. |         RETURNS (Doc): The modified `Doc` object. | ||||||
| 
 | 
 | ||||||
|         DOCS: https://spacy.io/api/doc#from_disk |         DOCS: https://spacy.io/api/doc#from_disk | ||||||
|  | @ -1339,7 +1339,7 @@ cdef class Doc: | ||||||
|     def to_bytes(self, *, exclude=tuple()): |     def to_bytes(self, *, exclude=tuple()): | ||||||
|         """Serialize, i.e. export the document contents to a binary string. |         """Serialize, i.e. export the document contents to a binary string. | ||||||
| 
 | 
 | ||||||
|         exclude (list): String names of serialization fields to exclude. |         exclude (Iterable[str]): String names of serialization fields to exclude. | ||||||
|         RETURNS (bytes): A losslessly serialized copy of the `Doc`, including |         RETURNS (bytes): A losslessly serialized copy of the `Doc`, including | ||||||
|             all annotations. |             all annotations. | ||||||
| 
 | 
 | ||||||
|  | @ -1351,7 +1351,7 @@ cdef class Doc: | ||||||
|         """Deserialize, i.e. import the document contents from a binary string. |         """Deserialize, i.e. import the document contents from a binary string. | ||||||
| 
 | 
 | ||||||
|         data (bytes): The string to load from. |         data (bytes): The string to load from. | ||||||
|         exclude (list): String names of serialization fields to exclude. |         exclude (Iterable[str]): String names of serialization fields to exclude. | ||||||
|         RETURNS (Doc): Itself. |         RETURNS (Doc): Itself. | ||||||
| 
 | 
 | ||||||
|         DOCS: https://spacy.io/api/doc#from_bytes |         DOCS: https://spacy.io/api/doc#from_bytes | ||||||
|  | @ -1361,11 +1361,8 @@ cdef class Doc: | ||||||
|     def to_dict(self, *, exclude=tuple()): |     def to_dict(self, *, exclude=tuple()): | ||||||
|         """Export the document contents to a dictionary for serialization. |         """Export the document contents to a dictionary for serialization. | ||||||
| 
 | 
 | ||||||
|         exclude (list): String names of serialization fields to exclude. |         exclude (Iterable[str]): String names of serialization fields to exclude. | ||||||
|         RETURNS (bytes): A losslessly serialized copy of the `Doc`, including |         RETURNS (Dict[str, Any]): A dictionary representation of the `Doc` | ||||||
|             all annotations. |  | ||||||
| 
 |  | ||||||
|         DOCS: https://spacy.io/api/doc#to_bytes |  | ||||||
|         """ |         """ | ||||||
|         array_head = Doc._get_array_attrs() |         array_head = Doc._get_array_attrs() | ||||||
|         strings = set() |         strings = set() | ||||||
|  | @ -1411,13 +1408,11 @@ cdef class Doc: | ||||||
|         return util.to_dict(serializers, exclude) |         return util.to_dict(serializers, exclude) | ||||||
| 
 | 
 | ||||||
|     def from_dict(self, msg, *, exclude=tuple()): |     def from_dict(self, msg, *, exclude=tuple()): | ||||||
|         """Deserialize, i.e. import the document contents from a binary string. |         """Deserialize the document contents from a dictionary representation. | ||||||
| 
 | 
 | ||||||
|         data (bytes): The string to load from. |         msg (Dict[str, Any]): The dictionary to load from. | ||||||
|         exclude (list): String names of serialization fields to exclude. |         exclude (Iterable[str]): String names of serialization fields to exclude. | ||||||
|         RETURNS (Doc): Itself. |         RETURNS (Doc): Itself. | ||||||
| 
 |  | ||||||
|         DOCS: https://spacy.io/api/doc#from_dict |  | ||||||
|         """ |         """ | ||||||
|         if self.length != 0: |         if self.length != 0: | ||||||
|             raise ValueError(Errors.E033.format(length=self.length)) |             raise ValueError(Errors.E033.format(length=self.length)) | ||||||
|  |  | ||||||
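A hedged round-trip sketch of the corrected `to_dict`/`from_dict` contract documented above: `to_dict()` returns a dictionary rather than bytes, and `from_dict()` restores it onto an empty `Doc` built from the same vocab.

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = nlp("Serialize me losslessly")
msg = doc.to_dict()  # Dict[str, Any], per the updated docstring
restored = Doc(nlp.vocab).from_dict(msg)
assert restored.text == doc.text
```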
|  | @ -127,14 +127,17 @@ cdef class Span: | ||||||
|         self._vector = vector |         self._vector = vector | ||||||
|         self._vector_norm = vector_norm |         self._vector_norm = vector_norm | ||||||
| 
 | 
 | ||||||
|     def __richcmp__(self, Span other, int op): |     def __richcmp__(self, object other, int op): | ||||||
|         if other is None: |         if other is None: | ||||||
|             if op == 0 or op == 1 or op == 2: |             if op == 0 or op == 1 or op == 2: | ||||||
|                 return False |                 return False | ||||||
|             else: |             else: | ||||||
|                 return True |                 return True | ||||||
|  |         if not isinstance(other, Span): | ||||||
|  |             return False | ||||||
|  |         cdef Span other_span = other | ||||||
|         self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc) |         self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc) | ||||||
|         other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.id, other.doc) |         other_tuple = (other_span.c.start_char, other_span.c.end_char, other_span.c.label, other_span.c.kb_id, other_span.id, other_span.doc) | ||||||
|         # < |         # < | ||||||
|         if op == 0: |         if op == 0: | ||||||
|             return self_tuple < other_tuple |             return self_tuple < other_tuple | ||||||
|  |  | ||||||
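A small sketch of the effect of the `isinstance` guard above (the parallel `Token.__richcmp__` change appears below): comparing against a non-`Span` object now returns `False` instead of raising a `TypeError`.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Bank of China")
span = doc[0:2]
assert not (span == "Bank of")  # no TypeError with the isinstance guard
assert doc[0] != doc[1]         # ordinary Token comparisons still work
```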
|  | @ -53,7 +53,12 @@ class Token: | ||||||
|     def __bytes__(self) -> bytes: ... |     def __bytes__(self) -> bytes: ... | ||||||
|     def __str__(self) -> str: ... |     def __str__(self) -> str: ... | ||||||
|     def __repr__(self) -> str: ... |     def __repr__(self) -> str: ... | ||||||
|     def __richcmp__(self, other: Token, op: int) -> bool: ... |     def __lt__(self, other: Any) -> bool: ... | ||||||
|  |     def __le__(self, other: Any) -> bool: ... | ||||||
|  |     def __eq__(self, other: Any) -> bool: ... | ||||||
|  |     def __ne__(self, other: Any) -> bool: ... | ||||||
|  |     def __gt__(self, other: Any) -> bool: ... | ||||||
|  |     def __ge__(self, other: Any) -> bool: ... | ||||||
|     @property |     @property | ||||||
|     def _(self) -> Underscore: ... |     def _(self) -> Underscore: ... | ||||||
|     def nbor(self, i: int = ...) -> Token: ... |     def nbor(self, i: int = ...) -> Token: ... | ||||||
|  |  | ||||||
|  | @ -139,17 +139,20 @@ cdef class Token: | ||||||
|     def __repr__(self): |     def __repr__(self): | ||||||
|         return self.__str__() |         return self.__str__() | ||||||
| 
 | 
 | ||||||
|     def __richcmp__(self, Token other, int op): |     def __richcmp__(self, object other, int op): | ||||||
|         # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html |         # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html | ||||||
|         if other is None: |         if other is None: | ||||||
|             if op in (0, 1, 2): |             if op in (0, 1, 2): | ||||||
|                 return False |                 return False | ||||||
|             else: |             else: | ||||||
|                 return True |                 return True | ||||||
|  |         if not isinstance(other, Token): | ||||||
|  |             return False | ||||||
|  |         cdef Token other_token = other | ||||||
|         cdef Doc my_doc = self.doc |         cdef Doc my_doc = self.doc | ||||||
|         cdef Doc other_doc = other.doc |         cdef Doc other_doc = other_token.doc | ||||||
|         my = self.idx |         my = self.idx | ||||||
|         their = other.idx |         their = other_token.idx | ||||||
|         if op == 0: |         if op == 0: | ||||||
|             return my < their |             return my < their | ||||||
|         elif op == 2: |         elif op == 2: | ||||||
|  |  | ||||||
|  | @ -16,3 +16,28 @@ from .iob_utils import (  # noqa: F401 | ||||||
|     tags_to_entities, |     tags_to_entities, | ||||||
| ) | ) | ||||||
| from .loggers import console_logger  # noqa: F401 | from .loggers import console_logger  # noqa: F401 | ||||||
|  | 
 | ||||||
|  | __all__ = [ | ||||||
|  |     "Alignment", | ||||||
|  |     "Corpus", | ||||||
|  |     "Example", | ||||||
|  |     "JsonlCorpus", | ||||||
|  |     "PlainTextCorpus", | ||||||
|  |     "biluo_tags_to_offsets", | ||||||
|  |     "biluo_tags_to_spans", | ||||||
|  |     "biluo_to_iob", | ||||||
|  |     "create_copy_from_base_model", | ||||||
|  |     "docs_to_json", | ||||||
|  |     "dont_augment", | ||||||
|  |     "iob_to_biluo", | ||||||
|  |     "minibatch_by_padded_size", | ||||||
|  |     "minibatch_by_words", | ||||||
|  |     "offsets_to_biluo_tags", | ||||||
|  |     "orth_variants_augmenter", | ||||||
|  |     "read_json_file", | ||||||
|  |     "remove_bilu_prefix", | ||||||
|  |     "split_bilu_label", | ||||||
|  |     "tags_to_entities", | ||||||
|  |     "validate_get_examples", | ||||||
|  |     "validate_examples", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | @ -1077,20 +1077,38 @@ def make_tempdir() -> Generator[Path, None, None]: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def is_in_jupyter() -> bool: | def is_in_jupyter() -> bool: | ||||||
|     """Check if user is running spaCy from a Jupyter notebook by detecting the |     """Check if user is running spaCy from a Jupyter or Colab notebook by | ||||||
|     IPython kernel. Mainly used for the displaCy visualizer. |     detecting the IPython kernel. Mainly used for the displaCy visualizer. | ||||||
|     RETURNS (bool): True if in Jupyter, False if not. |     RETURNS (bool): True if in Jupyter/Colab, False if not. | ||||||
|     """ |     """ | ||||||
|     # https://stackoverflow.com/a/39662359/6400719 |     # https://stackoverflow.com/a/39662359/6400719 | ||||||
|  |     # https://stackoverflow.com/questions/15411967 | ||||||
|     try: |     try: | ||||||
|         shell = get_ipython().__class__.__name__  # type: ignore[name-defined] |         if get_ipython().__class__.__name__ == "ZMQInteractiveShell":  # type: ignore[name-defined] | ||||||
|         if shell == "ZMQInteractiveShell": |  | ||||||
|             return True  # Jupyter notebook or qtconsole |             return True  # Jupyter notebook or qtconsole | ||||||
|  |         if get_ipython().__class__.__module__ == "google.colab._shell":  # type: ignore[name-defined] | ||||||
|  |             return True  # Colab notebook | ||||||
|     except NameError: |     except NameError: | ||||||
|         return False  # Probably standard Python interpreter |         pass  # Probably standard Python interpreter | ||||||
|  |     # additional check for Colab | ||||||
|  |     try: | ||||||
|  |         import google.colab | ||||||
|  | 
 | ||||||
|  |         return True  # Colab notebook | ||||||
|  |     except ImportError: | ||||||
|  |         pass | ||||||
|     return False |     return False | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def is_in_interactive() -> bool: | ||||||
|  |     """Check if user is running spaCy from an interactive Python | ||||||
|  |     shell. Will return True in Jupyter notebooks too. | ||||||
|  |     RETURNS (bool): True if in interactive mode, False if not. | ||||||
|  |     """ | ||||||
|  |     # https://stackoverflow.com/questions/2356399/tell-if-python-is-in-interactive-mode | ||||||
|  |     return hasattr(sys, "ps1") or hasattr(sys, "ps2") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def get_object_name(obj: Any) -> str: | def get_object_name(obj: Any) -> str: | ||||||
|     """Get a human-readable name of a Python object, e.g. a pipeline component. |     """Get a human-readable name of a Python object, e.g. a pipeline component. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
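A usage sketch for the two environment checks above; the branch messages are illustrative only, but the pattern matches how displaCy decides between returning markup inline and serving it.

```python
from spacy.util import is_in_interactive, is_in_jupyter

if is_in_jupyter():
    print("Notebook frontend detected: return HTML for inline rendering")
elif is_in_interactive():
    print("Interactive shell without a notebook: print or serve markup")
else:
    print("Plain script: serve the visualization or write it to a file")
```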
|  | @ -78,16 +78,16 @@ subword features, and a | ||||||
| [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer | [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer | ||||||
| consisting of a CNN and a layer-normalized maxout activation function. | consisting of a CNN and a layer-normalized maxout activation function. | ||||||
| 
 | 
 | ||||||
| | Name                 | Description                                                                                                                                                                                                                                                                   | | | Name                 | Description                                                                                                                                                                                                                                                                 | | ||||||
| | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~                                                                                                          | | | `width`              | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~                                                                                                        | | ||||||
| | `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~                                                                                                                                                                                | | | `depth`              | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~                                                                                                                                                                              | | ||||||
| | `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~                                                                                            | | | `embed_size`         | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~                                                                                          | | ||||||
| | `window_size`        | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ | | | `window_size`        | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ | | ||||||
| | `maxout_pieces`      | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~                                                                                   | | | `maxout_pieces`      | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~                                                                                 | | ||||||
| | `subword_features`   | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~                                       | | | `subword_features`   | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~                                     | | ||||||
| | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~                                                                                                                                                                                                                                  | | | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~                                                                                                                                                                                                                                | | ||||||
| | **CREATES**          | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                        | | | **CREATES**          | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                      | | ||||||
| 
 | 
 | ||||||
| ### spacy.Tok2VecListener.v1 {id="Tok2VecListener"} | ### spacy.Tok2VecListener.v1 {id="Tok2VecListener"} | ||||||
| 
 | 
 | ||||||
|  | @ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the | ||||||
| > nO = null | > nO = null | ||||||
| > | > | ||||||
| > [model.linear_model] | > [model.linear_model] | ||||||
| > @architectures = "spacy.TextCatBOW.v2" | > @architectures = "spacy.TextCatBOW.v3" | ||||||
| > exclusive_classes = true | > exclusive_classes = true | ||||||
|  | > length = 262144 | ||||||
| > ngram_size = 1 | > ngram_size = 1 | ||||||
| > no_output_layer = false | > no_output_layer = false | ||||||
| > | > | ||||||
|  | @ -1017,54 +1018,15 @@ but used an internal `tok2vec` instead of taking it as argument: | ||||||
| 
 | 
 | ||||||
| </Accordion> | </Accordion> | ||||||
| 
 | 
 | ||||||
| ### spacy.TextCatCNN.v2 {id="TextCatCNN"} | ### spacy.TextCatBOW.v3 {id="TextCatBOW"} | ||||||
| 
 | 
 | ||||||
| > #### Example Config | > #### Example Config | ||||||
| > | > | ||||||
| > ```ini | > ```ini | ||||||
| > [model] | > [model] | ||||||
| > @architectures = "spacy.TextCatCNN.v2" | > @architectures = "spacy.TextCatBOW.v3" | ||||||
| > exclusive_classes = false |  | ||||||
| > nO = null |  | ||||||
| > |  | ||||||
| > [model.tok2vec] |  | ||||||
| > @architectures = "spacy.HashEmbedCNN.v2" |  | ||||||
| > pretrained_vectors = null |  | ||||||
| > width = 96 |  | ||||||
| > depth = 4 |  | ||||||
| > embed_size = 2000 |  | ||||||
| > window_size = 1 |  | ||||||
| > maxout_pieces = 3 |  | ||||||
| > subword_features = true |  | ||||||
| > ``` |  | ||||||
| 
 |  | ||||||
| A neural network model where token vectors are calculated using a CNN. The |  | ||||||
| vectors are mean pooled and used as features in a feed-forward network. This |  | ||||||
| architecture is usually less accurate than the ensemble, but runs faster. |  | ||||||
| 
 |  | ||||||
| | Name                | Description                                                                                                                                                                                    | |  | ||||||
| | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |  | ||||||
| | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | |  | ||||||
| | `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        | |  | ||||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | |  | ||||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | |  | ||||||
| 
 |  | ||||||
| <Accordion title="spacy.TextCatCNN.v1 definition" spaced> |  | ||||||
| 
 |  | ||||||
| [TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was |  | ||||||
| not yet resizable. Since v2, new labels can be added to this component, even |  | ||||||
| after training. |  | ||||||
| 
 |  | ||||||
| </Accordion> |  | ||||||
| 
 |  | ||||||
| ### spacy.TextCatBOW.v2 {id="TextCatBOW"} |  | ||||||
| 
 |  | ||||||
| > #### Example Config |  | ||||||
| > |  | ||||||
| > ```ini |  | ||||||
| > [model] |  | ||||||
| > @architectures = "spacy.TextCatBOW.v2" |  | ||||||
| > exclusive_classes = false | > exclusive_classes = false | ||||||
|  | > length = 262144 | ||||||
| > ngram_size = 1 | > ngram_size = 1 | ||||||
| > no_output_layer = false | > no_output_layer = false | ||||||
| > nO = null | > nO = null | ||||||
|  | @ -1078,17 +1040,108 @@ the others, but may not be as accurate, especially if texts are short. | ||||||
| | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||||
| | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~                                           | | | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~                                           | | ||||||
| | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          | | | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          | | ||||||
|  | | `length`            | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~                                              | | ||||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||||
| 
 | 
 | ||||||
| <Accordion title="spacy.TextCatBOW.v1 definition" spaced> | <Accordion title="Previous versions of spacy.TextCatBOW" spaced> | ||||||
| 
 | 
 | ||||||
| [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was | - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2, | ||||||
| not yet resizable. Since v2, new labels can be added to this component, even |   new labels can be added to this component, even after training. | ||||||
| after training. | - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and | ||||||
|  |   [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear | ||||||
|  |   layer that only used a small number of the allocated parameters. | ||||||
|  | - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and | ||||||
|  |   [TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument. | ||||||
| 
 | 
 | ||||||
| </Accordion> | </Accordion> | ||||||
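A minimal sketch of the new `length` argument, assuming a spaCy version with `spacy.TextCatBOW.v3` registered: the architecture is resolved from the registry (as the diff's own model tests do) and given a `length` that is not a power of two, which the table above says is rounded up, here to 2**17 = 131072.

```python
from spacy.util import registry

make_bow = registry.architectures.get("spacy.TextCatBOW.v3")
model = make_bow(
    exclusive_classes=False,
    ngram_size=1,
    no_output_layer=False,
    length=100_000,  # not a power of two; expected to round up to 131072
)
```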
| 
 | 
 | ||||||
|  | ### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"} | ||||||
|  | 
 | ||||||
|  | > #### Example Config | ||||||
|  | > | ||||||
|  | > ```ini | ||||||
|  | > [model] | ||||||
|  | > @architectures = "spacy.TextCatParametricAttention.v1" | ||||||
|  | > exclusive_classes = true | ||||||
|  | > nO = null | ||||||
|  | > | ||||||
|  | > [model.tok2vec] | ||||||
|  | > @architectures = "spacy.Tok2Vec.v2" | ||||||
|  | > | ||||||
|  | > [model.tok2vec.embed] | ||||||
|  | > @architectures = "spacy.MultiHashEmbed.v2" | ||||||
|  | > width = 64 | ||||||
|  | > rows = [2000, 2000, 1000, 1000, 1000, 1000] | ||||||
|  | > attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] | ||||||
|  | > include_static_vectors = false | ||||||
|  | > | ||||||
|  | > [model.tok2vec.encode] | ||||||
|  | > @architectures = "spacy.MaxoutWindowEncoder.v2" | ||||||
|  | > width = ${model.tok2vec.embed.width} | ||||||
|  | > window_size = 1 | ||||||
|  | > maxout_pieces = 3 | ||||||
|  | > depth = 2 | ||||||
|  | > ``` | ||||||
|  | 
 | ||||||
|  | A neural network model that is built upon Tok2Vec and uses parametric attention | ||||||
|  | to attend to tokens that are relevant to text classification. | ||||||
|  | 
 | ||||||
|  | | Name                | Description                                                                                                                                                                                    | | ||||||
|  | | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
|  | | `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     | | ||||||
|  | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||||
|  | | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||||
|  | | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||||
|  | 
 | ||||||
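A hedged pipeline sketch mirroring the parametrized tests earlier in the diff: a `textcat` component whose model uses the parametric-attention architecture with spaCy's default tok2vec config.

```python
from spacy.lang.en import English
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL

nlp = English()
textcat = nlp.add_pipe(
    "textcat",
    config={
        "model": {
            "@architectures": "spacy.TextCatParametricAttention.v1",
            "tok2vec": DEFAULT_TOK2VEC_MODEL,
            "exclusive_classes": True,
        }
    },
)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.initialize()  # labels are set, so no training examples are needed here
```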
|  | ### spacy.TextCatReduce.v1 {id="TextCatReduce"} | ||||||
|  | 
 | ||||||
|  | > #### Example Config | ||||||
|  | > | ||||||
|  | > ```ini | ||||||
|  | > [model] | ||||||
|  | > @architectures = "spacy.TextCatReduce.v1" | ||||||
|  | > exclusive_classes = false | ||||||
|  | > use_reduce_first = false | ||||||
|  | > use_reduce_last = false | ||||||
|  | > use_reduce_max = false | ||||||
|  | > use_reduce_mean = true | ||||||
|  | > nO = null | ||||||
|  | > | ||||||
|  | > [model.tok2vec] | ||||||
|  | > @architectures = "spacy.HashEmbedCNN.v2" | ||||||
|  | > pretrained_vectors = null | ||||||
|  | > width = 96 | ||||||
|  | > depth = 4 | ||||||
|  | > embed_size = 2000 | ||||||
|  | > window_size = 1 | ||||||
|  | > maxout_pieces = 3 | ||||||
|  | > subword_features = true | ||||||
|  | > ``` | ||||||
|  | 
 | ||||||
|  | A classifier that pools token hidden representations of each `Doc` using | ||||||
|  | first, last, max or mean reduction and then applies a classification layer. | ||||||
|  | Reductions are concatenated when multiple reductions are used. | ||||||
|  | 
 | ||||||
|  | <Infobox variant="warning" title="Relation to TextCatCNN" id="TextCatCNN"> | ||||||
|  | 
 | ||||||
|  | `TextCatReduce` is a generalization of the older | ||||||
|  | [`TextCatCNN`](/api/legacy#TextCatCNN_v2) model. `TextCatCNN` always uses a mean | ||||||
|  | reduction, whereas `TextCatReduce` also supports first/last/max reductions. | ||||||
|  | 
 | ||||||
|  | </Infobox> | ||||||
|  | 
 | ||||||
|  | | Name                | Description                                                                                                                                                                                    | | ||||||
|  | | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
|  | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||||
|  | | `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        | | ||||||
|  | | `use_reduce_first`  | Pool by using the hidden representation of the first token of a `Doc`. ~~bool~~                                                                                                                | | ||||||
|  | | `use_reduce_last`   | Pool by using the hidden representation of the last token of a `Doc`. ~~bool~~                                                                                                                 | | ||||||
|  | | `use_reduce_max`    | Pool by taking the maximum values of the hidden representations of a `Doc`. ~~bool~~                                                                                                           | | ||||||
|  | | `use_reduce_mean`   | Pool by taking the mean of all hidden representations of a `Doc`. ~~bool~~                                                                                                                     | | ||||||
|  | | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||||
|  | | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||||
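|  |  | ||||||
|  | As a sketch of how this fits together in Python, the architecture can be | ||||||
|  | resolved from the function registry and called with the parameters from the | ||||||
|  | table above (the keyword names follow that table and should be treated as an | ||||||
|  | assumption rather than a guaranteed signature): | ||||||
|  |  | ||||||
|  | ```python | ||||||
|  | import spacy | ||||||
|  |  | ||||||
|  | # Look up the registered architecture functions by name. | ||||||
|  | create_tok2vec = spacy.registry.architectures.get("spacy.HashEmbedCNN.v2") | ||||||
|  | create_textcat = spacy.registry.architectures.get("spacy.TextCatReduce.v1") | ||||||
|  |  | ||||||
|  | tok2vec = create_tok2vec( | ||||||
|  |     width=96, depth=4, embed_size=2000, window_size=1, | ||||||
|  |     maxout_pieces=3, subword_features=True, pretrained_vectors=None, | ||||||
|  | ) | ||||||
|  | # Mean pooling only, matching the example config above. | ||||||
|  | model = create_textcat( | ||||||
|  |     tok2vec=tok2vec, exclusive_classes=False, use_reduce_first=False, | ||||||
|  |     use_reduce_last=False, use_reduce_max=False, use_reduce_mean=True, nO=None, | ||||||
|  | ) | ||||||
|  | ``` | ||||||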
|  | 
 | ||||||
| ## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"} | ## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"} | ||||||
| 
 | 
 | ||||||
| ### spacy.SpanCategorizer.v1 {id="SpanCategorizer"} | ### spacy.SpanCategorizer.v1 {id="SpanCategorizer"} | ||||||
|  |  | ||||||
|  | @ -1268,20 +1268,21 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is | ||||||
| warmed up before any measurements are taken. | warmed up before any measurements are taken. | ||||||
| 
 | 
 | ||||||
| ```cli | ```cli | ||||||
| $ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup] | $ python -m spacy benchmark speed [model] [data_path] [--code] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup] | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| | Name                 | Description                                                                                              | | | Name                 | Description                                                                                                                                                                          | | ||||||
| | -------------------- | -------------------------------------------------------------------------------------------------------- | | | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||||
| | `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ | | | `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~                                                                             | | ||||||
| | `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | | | `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                             | | ||||||
| | `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~           | | | `--code`, `-c`       | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | | ||||||
| | `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                          | | | `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~                                                                                       | | ||||||
| | `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                           | | | `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                                                                                                      | | ||||||
| | `--batches`          | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~                         | | | `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       | | ||||||
| | `--warmup`, `-w`     | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~               | | | `--batches`          | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~                                                                                                     | | ||||||
| | `--help`, `-h`       | Show help message and available arguments. ~~bool (flag)~~                                               | | | `--warmup`, `-w`     | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~                                                                                           | | ||||||
| | **PRINTS**           | Pipeline speed in words per second with a 95% confidence interval.                                       | | | `--help`, `-h`       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           | | ||||||
|  | | **PRINTS**           | Pipeline speed in words per second with a 95% confidence interval.                                                                                                                   | | ||||||
| 
 | 
 | ||||||
| ## apply {id="apply", version="3.5", tag="command"} | ## apply {id="apply", version="3.5", tag="command"} | ||||||
| 
 | 
 | ||||||
|  | @ -1296,6 +1297,9 @@ input formats are: | ||||||
| 
 | 
 | ||||||
| When a directory is provided it is traversed recursively to collect all files. | When a directory is provided it is traversed recursively to collect all files. | ||||||
| 
 | 
 | ||||||
|  | When loading a `.spacy` file, any annotations stored on the `Doc` that are not overwritten by the pipeline will be preserved. | ||||||
|  | If you want to evaluate the pipeline on raw text only, make sure that the `.spacy` file does not contain any annotations. | ||||||
|  | 
 | ||||||
| ```bash | ```bash | ||||||
| $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process] | $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process] | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
|  | @ -400,6 +400,14 @@ identifiers are grouped by token. Instances of this class are typically assigned | ||||||
| to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension | to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension | ||||||
| attribute. | attribute. | ||||||
| 
 | 
 | ||||||
|  | > #### Example | ||||||
|  | > | ||||||
|  | > ```python | ||||||
|  | > # Get the last hidden layer output for "is" (token index 1) | ||||||
|  | > doc = nlp("This is a text.") | ||||||
|  | > tensors = doc._.trf_data.last_hidden_layer_state[1] | ||||||
|  | > ``` | ||||||
|  | 
 | ||||||
| | Name              | Description                                                                                                                                                                        | | | Name              | Description                                                                                                                                                                        | | ||||||
| | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `all_outputs`     | List of `Ragged` tensors that correspond to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ | | | `all_outputs`     | List of `Ragged` tensors that correspond to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ | ||||||
|  |  | ||||||
|  | @ -20,10 +20,9 @@ An LLM component is implemented through the `LLMWrapper` class. It is accessible | ||||||
| through a generic `llm` | through a generic `llm` | ||||||
| [component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories) | [component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories) | ||||||
| as well as through task-specific component factories: `llm_ner`, `llm_spancat`, | as well as through task-specific component factories: `llm_ner`, `llm_spancat`, | ||||||
| `llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization` and | `llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization`, | ||||||
| `llm_entity_linker`. | `llm_entity_linker`, `llm_raw` and `llm_translation`. For these factories, the | ||||||
| 
 | GPT-3.5 model from OpenAI is used by default, but this can be customized. | ||||||
| ### LLMWrapper.\_\_init\_\_ {id="init",tag="method"} |  | ||||||
| 
 | 
 | ||||||
| > #### Example | > #### Example | ||||||
| > | > | ||||||
|  | @ -33,13 +32,18 @@ as well as through task-specific component factories: `llm_ner`, `llm_spancat`, | ||||||
| > llm = nlp.add_pipe("llm", config=config) | > llm = nlp.add_pipe("llm", config=config) | ||||||
| > | > | ||||||
| > # Construction via add_pipe with a task-specific factory and default GPT3.5 model | > # Construction via add_pipe with a task-specific factory and default GPT3.5 model | ||||||
| > llm = nlp.add_pipe("llm-ner") | > llm = nlp.add_pipe("llm_ner") | ||||||
|  | > | ||||||
|  | > # Construction via add_pipe with a task-specific factory and custom model | ||||||
|  | > llm = nlp.add_pipe("llm_ner", config={"model": {"@llm_models": "spacy.Dolly.v1", "name": "dolly-v2-12b"}}) | ||||||
| > | > | ||||||
| > # Construction from class | > # Construction from class | ||||||
| > from spacy_llm.pipeline import LLMWrapper | > from spacy_llm.pipeline import LLMWrapper | ||||||
| > llm = LLMWrapper(vocab=nlp.vocab, task=task, model=model, cache=cache, save_io=True) | > llm = LLMWrapper(vocab=nlp.vocab, task=task, model=model, cache=cache, save_io=True) | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
|  | ### LLMWrapper.\_\_init\_\_ {id="init",tag="method"} | ||||||
|  | 
 | ||||||
| Create a new pipeline instance. In your application, you would normally use a | Create a new pipeline instance. In your application, you would normally use a | ||||||
| shortcut for this and instantiate the component using its string name and | shortcut for this and instantiate the component using its string name and | ||||||
| [`nlp.add_pipe`](/api/language#add_pipe). | [`nlp.add_pipe`](/api/language#add_pipe). | ||||||
|  | @ -225,8 +229,8 @@ All tasks are registered in the `llm_tasks` registry. | ||||||
| dataset across multiple storage units for easier processing and lookups. In | dataset across multiple storage units for easier processing and lookups. In | ||||||
| `spacy-llm` we use this term (synonymously: "mapping") to describe the splitting | `spacy-llm` we use this term (synonymously: "mapping") to describe the splitting | ||||||
| up of prompts if they are too long for a model to handle, and "fusing" | up of prompts if they are too long for a model to handle, and "fusing" | ||||||
| (synonymously: "reducing") to describe how the model responses for several shards | (synonymously: "reducing") to describe how the model responses for several | ||||||
| are merged back together into a single document. | shards are merged back together into a single document. | ||||||
| 
 | 
 | ||||||
| Prompts are broken up in a manner that _always_ keeps the prompt in the template | Prompts are broken up in a manner that _always_ keeps the prompt in the template | ||||||
| intact, meaning that the instructions to the LLM will always stay complete. The | intact, meaning that the instructions to the LLM will always stay complete. The | ||||||
|  | @ -1133,6 +1137,25 @@ supports `.yml`, `.yaml`, `.json` and `.jsonl`. | ||||||
| path = "textcat_examples.json" | path = "textcat_examples.json" | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
|  | If you want to perform few-shot learning with a binary classifier (i.e. a text | ||||||
|  | either should or should not be assigned to a given class), you can provide | ||||||
|  | positive and negative examples with answers of "POS" or "NEG". "POS" means that | ||||||
|  | the example should be assigned the class label defined in the configuration, | ||||||
|  | while "NEG" means that it shouldn't. For example, for spam classification: | ||||||
|  | 
 | ||||||
|  | ```json | ||||||
|  | [ | ||||||
|  |   { | ||||||
|  |     "text": "You won the lottery! Wire a fee of 200$ to be able to withdraw your winnings.", | ||||||
|  |     "answer": "POS" | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |     "text": "Your order #123456789 has arrived", | ||||||
|  |     "answer": "NEG" | ||||||
|  |   } | ||||||
|  | ] | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
| ### REL {id="rel"} | ### REL {id="rel"} | ||||||
| 
 | 
 | ||||||
| The REL task extracts relations between named entities. | The REL task extracts relations between named entities. | ||||||
|  | @ -1484,7 +1507,7 @@ These models all take the same parameters: | ||||||
| > ```ini | > ```ini | ||||||
| > [components.llm.model] | > [components.llm.model] | ||||||
| > @llm_models = "spacy.Llama2.v1" | > @llm_models = "spacy.Llama2.v1" | ||||||
| > name = "llama2-7b-hf" | > name = "Llama-2-7b-hf" | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| Currently, these models are provided as part of the core library: | Currently, these models are provided as part of the core library: | ||||||
|  |  | ||||||
|  | @ -162,7 +162,10 @@ network has an internal CNN Tok2Vec layer and uses attention. | ||||||
| 
 | 
 | ||||||
| Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means | Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means | ||||||
| that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not | that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not | ||||||
| yet support that. | yet support that. `TextCatCNN` has been replaced by the more general | ||||||
|  | [`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is | ||||||
|  | identical to `TextCatReduce` with `use_reduce_mean=true`, | ||||||
|  | `use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`. | ||||||
| 
 | 
 | ||||||
| > #### Example Config | > #### Example Config | ||||||
| > | > | ||||||
|  | @ -194,11 +197,58 @@ architecture is usually less accurate than the ensemble, but runs faster. | ||||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||||
| 
 | 
 | ||||||
|  | ### spacy.TextCatCNN.v2 {id="TextCatCNN_v2"} | ||||||
|  | 
 | ||||||
|  | > #### Example Config | ||||||
|  | > | ||||||
|  | > ```ini | ||||||
|  | > [model] | ||||||
|  | > @architectures = "spacy.TextCatCNN.v2" | ||||||
|  | > exclusive_classes = false | ||||||
|  | > nO = null | ||||||
|  | > | ||||||
|  | > [model.tok2vec] | ||||||
|  | > @architectures = "spacy.HashEmbedCNN.v2" | ||||||
|  | > pretrained_vectors = null | ||||||
|  | > width = 96 | ||||||
|  | > depth = 4 | ||||||
|  | > embed_size = 2000 | ||||||
|  | > window_size = 1 | ||||||
|  | > maxout_pieces = 3 | ||||||
|  | > subword_features = true | ||||||
|  | > ``` | ||||||
|  | 
 | ||||||
|  | A neural network model where token vectors are calculated using a CNN. The | ||||||
|  | vectors are mean pooled and used as features in a feed-forward network. This | ||||||
|  | architecture is usually less accurate than the ensemble, but runs faster. | ||||||
|  | 
 | ||||||
|  | `TextCatCNN` has been replaced by the more general | ||||||
|  | [`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is | ||||||
|  | identical to `TextCatReduce` with `use_reduce_mean=true`, | ||||||
|  | `use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`. | ||||||
|  | 
 | ||||||
|  | | Name                | Description                                                                                                                                                                                    | | ||||||
|  | | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
|  | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||||
|  | | `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        | | ||||||
|  | | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||||
|  | | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||||
|  | 
 | ||||||
|  | <Accordion title="spacy.TextCatCNN.v1 definition" spaced> | ||||||
|  | 
 | ||||||
|  | [TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was | ||||||
|  | not yet resizable. Since v2, new labels can be added to this component, even | ||||||
|  | after training. | ||||||
|  | 
 | ||||||
|  | </Accordion> | ||||||
|  | 
 | ||||||
| ### spacy.TextCatBOW.v1 {id="TextCatBOW_v1"} | ### spacy.TextCatBOW.v1 {id="TextCatBOW_v1"} | ||||||
| 
 | 
 | ||||||
| Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means | Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means | ||||||
| that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not | that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not | ||||||
| yet support that. | yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an | ||||||
|  | erroneous sparse linear layer that made use of only a small number of the | ||||||
|  | allocated parameters. | ||||||
| 
 | 
 | ||||||
| > #### Example Config | > #### Example Config | ||||||
| > | > | ||||||
|  | @ -222,6 +272,33 @@ the others, but may not be as accurate, especially if texts are short. | ||||||
| | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||||
| | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||||
| 
 | 
 | ||||||
|  | ### spacy.TextCatBOW.v2 {id="TextCatBOW"} | ||||||
|  | 
 | ||||||
|  | Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse | ||||||
|  | linear layer that made use of only a small number of the allocated parameters. | ||||||
|  | 
 | ||||||
|  | > #### Example Config | ||||||
|  | > | ||||||
|  | > ```ini | ||||||
|  | > [model] | ||||||
|  | > @architectures = "spacy.TextCatBOW.v2" | ||||||
|  | > exclusive_classes = false | ||||||
|  | > ngram_size = 1 | ||||||
|  | > no_output_layer = false | ||||||
|  | > nO = null | ||||||
|  | > ``` | ||||||
|  | 
 | ||||||
|  | An n-gram "bag-of-words" model. This architecture should run much faster than | ||||||
|  | the others, but may not be as accurate, especially if texts are short. | ||||||
|  | 
 | ||||||
|  | | Name                | Description                                                                                                                                                                                    | | ||||||
|  | | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
|  | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     | | ||||||
|  | | `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~                                           | | ||||||
|  | | `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~                                                          | | ||||||
|  | | `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||||
|  | | **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||||
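|  |  | ||||||
|  | Because the architecture is resizable since v2, labels can be added to the | ||||||
|  | component even after training. A minimal sketch (the pipeline setup is | ||||||
|  | illustrative, not taken from a real project): | ||||||
|  |  | ||||||
|  | ```python | ||||||
|  | import spacy | ||||||
|  |  | ||||||
|  | nlp = spacy.blank("en") | ||||||
|  | textcat = nlp.add_pipe( | ||||||
|  |     "textcat", | ||||||
|  |     config={ | ||||||
|  |         "model": { | ||||||
|  |             "@architectures": "spacy.TextCatBOW.v2", | ||||||
|  |             "exclusive_classes": True, | ||||||
|  |             "ngram_size": 1, | ||||||
|  |             "no_output_layer": False, | ||||||
|  |         } | ||||||
|  |     }, | ||||||
|  | ) | ||||||
|  | # Since v2 this also works on a previously trained component. | ||||||
|  | textcat.add_label("POSITIVE") | ||||||
|  | textcat.add_label("NEGATIVE") | ||||||
|  | ``` | ||||||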
|  | 
 | ||||||
| ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"} | ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"} | ||||||
| 
 | 
 | ||||||
| Identical to | Identical to | ||||||
|  |  | ||||||
|  | @ -89,6 +89,21 @@ architectures and their arguments and hyperparameters. | ||||||
| | `negative_weight` <Tag variant="new">3.5.1</Tag>    | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~                                                                                                               | | | `negative_weight` <Tag variant="new">3.5.1</Tag>    | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~                                                                                                               | | ||||||
| | `allow_overlap` <Tag variant="new">3.5.1</Tag>      | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~                                                                                                                                                        | | | `allow_overlap` <Tag variant="new">3.5.1</Tag>      | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~                                                                                                                                                        | | ||||||
| 
 | 
 | ||||||
|  | <Infobox variant="warning"> | ||||||
|  | 
 | ||||||
|  | If you set a non-default value for `spans_key`, you'll have to update | ||||||
|  | `[training.score_weights]` as well so that weights are computed properly. E.g. | ||||||
|  | for `spans_key == "myspankey"`, include this in your config: | ||||||
|  | 
 | ||||||
|  | ```ini | ||||||
|  | [training.score_weights] | ||||||
|  | spans_myspankey_f = 1.0 | ||||||
|  | spans_myspankey_p = 0.0 | ||||||
|  | spans_myspankey_r = 0.0 | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | </Infobox> | ||||||
|  | 
 | ||||||
| ```python | ```python | ||||||
| %%GITHUB_SPACY/spacy/pipeline/spancat.py | %%GITHUB_SPACY/spacy/pipeline/spancat.py | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
|  | @ -397,6 +397,17 @@ are wrapped into the | ||||||
| by this class. Instances of this class are typically assigned to the | by this class. Instances of this class are typically assigned to the | ||||||
| [`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute. | [`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute. | ||||||
| 
 | 
 | ||||||
|  | > #### Example | ||||||
|  | > | ||||||
|  | > ```python | ||||||
|  | > # Get the last hidden layer output for "is" (token index 1) | ||||||
|  | > doc = nlp("This is a text.") | ||||||
|  | > indices = doc._.trf_data.align[1].data.flatten() | ||||||
|  | > last_hidden_state = doc._.trf_data.model_output.last_hidden_state | ||||||
|  | > dim = last_hidden_state.shape[-1] | ||||||
|  | > tensors = last_hidden_state.reshape(-1, dim)[indices] | ||||||
|  | > ``` | ||||||
|  | 
 | ||||||
| | Name           | Description                                                                                                                                                                                                                                                                                                                          | | | Name           | Description                                                                                                                                                                                                                                                                                                                          | | ||||||
| | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||||
| | `tokens`       | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~                       | | | `tokens`       | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~                       | | ||||||
|  |  | ||||||
|  | @ -13,7 +13,7 @@ between `Doc` objects. | ||||||
| <Infobox variant ="warning"> | <Infobox variant ="warning"> | ||||||
| 
 | 
 | ||||||
| Note that a `Vocab` instance is not static. It increases in size as texts with | Note that a `Vocab` instance is not static. It increases in size as texts with | ||||||
| new tokens are processed. | new tokens are processed. Some models may have an empty vocab at initialization (see the sketch below). | ||||||
| 
 | 
 | ||||||
| </Infobox> | </Infobox> | ||||||
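|  |  | ||||||
|  | A minimal sketch of this behavior, assuming a blank English pipeline: | ||||||
|  |  | ||||||
|  | ```python | ||||||
|  | import spacy | ||||||
|  |  | ||||||
|  | nlp = spacy.blank("en") | ||||||
|  | n_lexemes = len(nlp.vocab) | ||||||
|  | # Processing text adds entries for tokens the vocab hasn't seen yet. | ||||||
|  | nlp("Zebras graze quietly at dusk.") | ||||||
|  | assert len(nlp.vocab) > n_lexemes | ||||||
|  | ``` | ||||||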
| 
 | 
 | ||||||
|  | @ -93,6 +93,7 @@ given string, you need to look it up in | ||||||
| > #### Example | > #### Example | ||||||
| > | > | ||||||
| > ```python | > ```python | ||||||
|  | > nlp("I'm eating an apple") | ||||||
| > apple = nlp.vocab.strings["apple"] | > apple = nlp.vocab.strings["apple"] | ||||||
| > oov = nlp.vocab.strings["dskfodkfos"] | > oov = nlp.vocab.strings["dskfodkfos"] | ||||||
| > assert apple in nlp.vocab | > assert apple in nlp.vocab | ||||||
|  |  | ||||||
|  | @ -108,12 +108,12 @@ In the `sm`/`md`/`lg` models: | ||||||
| 
 | 
 | ||||||
| #### CNN/CPU pipelines with floret vectors | #### CNN/CPU pipelines with floret vectors | ||||||
| 
 | 
 | ||||||
| The Finnish, Korean and Swedish `md` and `lg` pipelines use | The Croatian, Finnish, Korean, Slovenian, Swedish and Ukrainian `md` and `lg` | ||||||
| [floret vectors](/usage/v3-2#vectors) instead of default vectors. If you're | pipelines use [floret vectors](/usage/v3-2#vectors) instead of default vectors. | ||||||
| running a trained pipeline on texts and working with [`Doc`](/api/doc) objects, | If you're running a trained pipeline on texts and working with [`Doc`](/api/doc) | ||||||
| you shouldn't notice any difference with floret vectors. With floret vectors no | objects, you shouldn't notice any difference with floret vectors. With floret | ||||||
| tokens are out-of-vocabulary, so [`Token.is_oov`](/api/token#attributes) will | vectors no tokens are out-of-vocabulary, so | ||||||
| return `False` for all tokens. | [`Token.is_oov`](/api/token#attributes) will return `False` for all tokens. | ||||||
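|  |  | ||||||
|  | For instance (assuming the Finnish `md` pipeline, which uses floret vectors, | ||||||
|  | is installed): | ||||||
|  |  | ||||||
|  | ```python | ||||||
|  | import spacy | ||||||
|  |  | ||||||
|  | nlp = spacy.load("fi_core_news_md") | ||||||
|  | doc = nlp("Tämä on esimerkki.")  # "This is an example." | ||||||
|  | # With floret vectors, no token is out-of-vocabulary. | ||||||
|  | assert not any(token.is_oov for token in doc) | ||||||
|  | ``` | ||||||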
| 
 | 
 | ||||||
| If you access vectors directly for similarity comparisons, there are a few | If you access vectors directly for similarity comparisons, there are a few | ||||||
| differences because floret vectors don't include a fixed word list like the | differences because floret vectors don't include a fixed word list like the | ||||||
|  | @ -132,10 +132,20 @@ vector keys for default vectors. | ||||||
| 
 | 
 | ||||||
| ### Transformer pipeline design {id="design-trf"} | ### Transformer pipeline design {id="design-trf"} | ||||||
| 
 | 
 | ||||||
| In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present) | In the transformer (`trf`) pipelines, the `tagger`, `parser` and `ner` (if | ||||||
| all listen to the `transformer` component. The `attribute_ruler` and | present) all listen to the `transformer` component. The `attribute_ruler` and | ||||||
| `lemmatizer` have the same configuration as in the CNN models. | `lemmatizer` have the same configuration as in the CNN models. | ||||||
| 
 | 
 | ||||||
|  | For spaCy v3.0-v3.6, `trf` pipelines use | ||||||
|  | [`spacy-transformers`](https://github.com/explosion/spacy-transformers) and the | ||||||
|  | transformer output in `doc._.trf_data` is a | ||||||
|  | [`TransformerData`](/api/transformer#transformerdata) object. | ||||||
|  | 
 | ||||||
|  | For spaCy v3.7+, `trf` pipelines use | ||||||
|  | [`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers) | ||||||
|  | and `doc._.trf_data` is a | ||||||
|  | [`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object. | ||||||
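|  |  | ||||||
|  | A small sketch for checking which of the two objects a loaded pipeline | ||||||
|  | produces (assumes a `trf` pipeline such as `en_core_web_trf` is installed): | ||||||
|  |  | ||||||
|  | ```python | ||||||
|  | import spacy | ||||||
|  |  | ||||||
|  | nlp = spacy.load("en_core_web_trf") | ||||||
|  | doc = nlp("This is a text.") | ||||||
|  | # TransformerData for v3.0-v3.6 pipelines, DocTransformerOutput for v3.7+. | ||||||
|  | print(type(doc._.trf_data).__name__) | ||||||
|  | ``` | ||||||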
|  | 
 | ||||||
| ### Modifying the default pipeline {id="design-modify"} | ### Modifying the default pipeline {id="design-modify"} | ||||||
| 
 | 
 | ||||||
| For faster processing, you may only want to run a subset of the components in a | For faster processing, you may only want to run a subset of the components in a | ||||||
|  |  | ||||||
|  | @ -31,8 +31,6 @@ for ent in doc.ents: | ||||||
| Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what | Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what | ||||||
| our example sentence and its named entities look like: | our example sentence and its named entities look like: | ||||||
| 
 | 
 | ||||||
| <Iframe | <Standalone height={120}> | ||||||
|   title="displaCy visualization of entities" | <div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}><mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Apple <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> is looking at buying <mark style={{ background: '#feca74', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>U.K. <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>GPE</span></mark> startup for <mark style={{ background: '#e4e7d2', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>$1 billion <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>MONEY</span></mark></div> | ||||||
|   src="/images/displacy-ent1.html" | </Standalone> | ||||||
|   height={100} |  | ||||||
| /> |  | ||||||
|  |  | ||||||
|  | @ -56,8 +56,7 @@ for token in doc: | ||||||
| Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what | Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what | ||||||
| our example sentence and its dependencies look like: | our example sentence and its dependencies look like: | ||||||
| 
 | 
 | ||||||
| <Iframe | <ImageScrollable | ||||||
|   title="displaCy visualization of dependencies and entities" |   src="/images/displacy-long.svg" | ||||||
|   src="/images/displacy-long.html" |   width={1975} | ||||||
|   height={450} |  | ||||||
| /> | /> | ||||||
|  |  | ||||||
|  | @ -153,8 +153,9 @@ maxout_pieces = 3 | ||||||
| depth = 2 | depth = 2 | ||||||
| 
 | 
 | ||||||
| [components.textcat.model.linear_model] | [components.textcat.model.linear_model] | ||||||
| @architectures = "spacy.TextCatBOW.v2" | @architectures = "spacy.TextCatBOW.v3" | ||||||
| exclusive_classes = true | exclusive_classes = true | ||||||
|  | length = 262144 | ||||||
| ngram_size = 1 | ngram_size = 1 | ||||||
| no_output_layer = false | no_output_layer = false | ||||||
| ``` | ``` | ||||||
|  | @ -170,8 +171,9 @@ factory = "textcat" | ||||||
| labels = [] | labels = [] | ||||||
| 
 | 
 | ||||||
| [components.textcat.model] | [components.textcat.model] | ||||||
| @architectures = "spacy.TextCatBOW.v2" | @architectures = "spacy.TextCatBOW.v3" | ||||||
| exclusive_classes = true | exclusive_classes = true | ||||||
|  | length = 262144 | ||||||
| ngram_size = 1 | ngram_size = 1 | ||||||
| no_output_layer = false | no_output_layer = false | ||||||
| nO = null | nO = null | ||||||
|  |  | ||||||
|  | @ -290,11 +290,7 @@ for token in doc: | ||||||
| | toward        | `prep`     | shift     | `NOUN`   | manufacturers           | | | toward        | `prep`     | shift     | `NOUN`   | manufacturers           | | ||||||
| | manufacturers | `pobj`     | toward    | `ADP`    |                         | | | manufacturers | `pobj`     | toward    | `ADP`    |                         | | ||||||
| 
 | 
 | ||||||
| <Iframe | <ImageScrollable src="/images/displacy-long2.svg" width={1275} /> | ||||||
|   title="displaCy visualization of dependencies and entities 2" |  | ||||||
|   src="/images/displacy-long2.html" |  | ||||||
|   height={450} |  | ||||||
| /> |  | ||||||
| 
 | 
 | ||||||
| Because the syntactic relations form a tree, every word has **exactly one | Because the syntactic relations form a tree, every word has **exactly one | ||||||
| head**. You can therefore iterate over the arcs in the tree by iterating over | head**. You can therefore iterate over the arcs in the tree by iterating over | ||||||
|  | @ -709,11 +705,9 @@ doc = nlp(text) | ||||||
| displacy.serve(doc, style="ent") | displacy.serve(doc, style="ent") | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| <Iframe | <Standalone height={180}> | ||||||
|   title="displaCy visualizer for entities" | <div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>When <mark style={{ background: '#aa9cfc', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Sebastian Thrun <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>PERSON</span></mark> started working on self-driving cars at <mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> in <mark style={{ background: '#bfe1d9', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>2007 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>DATE</span></mark>, few people outside of the company took him seriously.</div> | ||||||
|   src="/images/displacy-ent2.html" | </Standalone> | ||||||
|   height={180} |  | ||||||
| /> |  | ||||||
| 
 | 
 | ||||||
| ## Entity Linking {id="entity-linking"} | ## Entity Linking {id="entity-linking"} | ||||||
| 
 | 
 | ||||||
|  | @ -723,6 +717,10 @@ identifier from a knowledge base (KB). You can create your own | ||||||
| [`KnowledgeBase`](/api/kb) and [train](/usage/training) a new | [`KnowledgeBase`](/api/kb) and [train](/usage/training) a new | ||||||
| [`EntityLinker`](/api/entitylinker) using that custom knowledge base. | [`EntityLinker`](/api/entitylinker) using that custom knowledge base. | ||||||
| 
 | 
 | ||||||
|  | For an example of how to define a `KnowledgeBase` and train an entity linker | ||||||
|  | model, see [this tutorial](https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson) | ||||||
|  | using [spaCy projects](/usage/projects). | ||||||
|  | 
 | ||||||
| ### Accessing entity identifiers {id="entity-linking-accessing",model="entity linking"} | ### Accessing entity identifiers {id="entity-linking-accessing",model="entity linking"} | ||||||
| 
 | 
 | ||||||
| The annotated KB identifier is accessible as either a hash value or as a string, | The annotated KB identifier is accessible as either a hash value or as a string, | ||||||
|  | @ -733,6 +731,7 @@ object, or the `ent_kb_id` and `ent_kb_id_` attributes of a | ||||||
| ```python | ```python | ||||||
| import spacy | import spacy | ||||||
| 
 | 
 | ||||||
|  | # "my_custom_el_pipeline" is assumed to be a custom NLP pipeline that was trained and serialized to disk | ||||||
| nlp = spacy.load("my_custom_el_pipeline") | nlp = spacy.load("my_custom_el_pipeline") | ||||||
| doc = nlp("Ada Lovelace was born in London") | doc = nlp("Ada Lovelace was born in London") | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1328,8 +1328,9 @@ labels = [] | ||||||
| # This function is created and then passed to the "textcat" component as | # This function is created and then passed to the "textcat" component as | ||||||
| # the argument "model" | # the argument "model" | ||||||
| [components.textcat.model] | [components.textcat.model] | ||||||
| @architectures = "spacy.TextCatBOW.v2" | @architectures = "spacy.TextCatBOW.v3" | ||||||
| exclusive_classes = true | exclusive_classes = true | ||||||
|  | length = 262144 | ||||||
| ngram_size = 1 | ngram_size = 1 | ||||||
| no_output_layer = false | no_output_layer = false | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1144,10 +1144,9 @@ relations and tokens we want to match: | ||||||
| > displacy.serve(doc) | > displacy.serve(doc) | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| <Iframe | <ImageScrollable | ||||||
|   title="displaCy visualization of dependencies" |   src="/images/displacy-dep-founded.svg" | ||||||
|   src="/images/displacy-dep-founded.html" |   width={925} | ||||||
|   height={450} |  | ||||||
| /> | /> | ||||||
| 
 | 
 | ||||||
| The relations we're interested in are: | The relations we're interested in are: | ||||||
|  |  | ||||||
|  | @ -405,7 +405,7 @@ available to spaCy, all you need to do is install the package in your | ||||||
| environment: | environment: | ||||||
| 
 | 
 | ||||||
| ```bash | ```bash | ||||||
| $ python setup.py develop | $ python -m pip install . | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| spaCy is now able to create the pipeline component `"snek"` – even though you | spaCy is now able to create the pipeline component `"snek"` – even though you | ||||||
|  | @ -586,11 +586,9 @@ After installing the package, the custom colors will be used when visualizing | ||||||
| text with `displacy`. Whenever the label `SNEK` is assigned, it will be | text with `displacy`. Whenever the label `SNEK` is assigned, it will be | ||||||
| displayed in `#3dff74`. | displayed in `#3dff74`. | ||||||
| 
 | 
 | ||||||
| <Iframe | <Standalone height={100}> | ||||||
|   title="displaCy visualization of entities" | <div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>🌱🌿 <mark style={{ background: '#3dff74', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>🐍 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>SNEK</span></mark> ____ 🌳🌲 ____ <mark style={{ background: '#cfc5ff', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>👨🌾 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>HUMAN</span></mark> 🏘️</div> | ||||||
|   src="/images/displacy-ent-snek.html" | </Standalone> | ||||||
|   height={100} |  | ||||||
| /> |  | ||||||
| 
 | 
 | ||||||
| ## Saving, loading and distributing trained pipelines {id="models"} | ## Saving, loading and distributing trained pipelines {id="models"} | ||||||
| 
 | 
 | ||||||
|  | @ -675,7 +673,7 @@ $ python -m spacy package ./en_example_pipeline ./packages | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| This command will create a pipeline package directory and will run | This command will create a pipeline package directory and will run | ||||||
| `python setup.py sdist` in that directory to create a binary `.whl` file or | `python -m build` in that directory to create a binary `.whl` file or | ||||||
| `.tar.gz` archive of your package that can be installed using `pip install`. | `.tar.gz` archive of your package that can be installed using `pip install`. | ||||||
| Installing the binary wheel is usually more efficient. | Installing the binary wheel is usually more efficient. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -77,11 +77,9 @@ doc.spans["custom"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")] | ||||||
| displacy.serve(doc, style="span", options={"spans_key": "custom"}) | displacy.serve(doc, style="span", options={"spans_key": "custom"}) | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| <Iframe | <Standalone height={100}> | ||||||
|   title="displaCy visualizer for overlapping spans" | <div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#7aecec', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#7aecec', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>ORG</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#feca74', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>GPE</span></span></span>.</div> | ||||||
|   src="/images/displacy-span.html" | </Standalone> | ||||||
|   height={180} |  | ||||||
| /> |  | ||||||
| 
 | 
 | ||||||
| ## Additional features and improvements | ## Additional features and improvements | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -119,11 +119,9 @@ doc = nlp(text) | ||||||
| displacy.serve(doc, style="ent") | displacy.serve(doc, style="ent") | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| <Iframe | <Standalone height={180}> | ||||||
|   title="displaCy visualizer for entities" | <div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>When <mark style={{ background: '#aa9cfc', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Sebastian Thrun <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>PERSON</span></mark> started working on self-driving cars at <mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> in <mark style={{ background: '#bfe1d9', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>2007 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>DATE</span></mark>, few people outside of the company took him seriously.</div> | ||||||
|   src="/images/displacy-ent2.html" | </Standalone> | ||||||
|   height={180} |  | ||||||
| /> |  | ||||||
| 
 | 
 | ||||||
| The entity visualizer lets you customize the following `options`: | The entity visualizer lets you customize the following `options`: | ||||||
| 
 | 
 | ||||||
|  | @ -148,11 +146,9 @@ use the `colors` setting to add your own colors for them. | ||||||
| > displacy.serve(doc, style="ent", options=options) | > displacy.serve(doc, style="ent", options=options) | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| <Iframe | <Standalone height={225}> | ||||||
|   title="displaCy visualizer for entities (custom styling)" | <div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>But <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> is starting from behind. The company made a late push into hardware, and <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Apple <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark>’s Siri, available on iPhones, and <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Amazon <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark>’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer adoption.</div> | ||||||
|   src="/images/displacy-ent-custom.html" | </Standalone> | ||||||
|   height={225} |  | ||||||
| /> |  | ||||||
| 
 | 
 | ||||||
| The above example uses a little trick: Since the background color values are | The above example uses a little trick: Since the background color values are | ||||||
| added as the `background` style attribute, you can use any | added as the `background` style attribute, you can use any | ||||||
|  | @ -197,11 +193,9 @@ doc.spans["sc"] = [ | ||||||
| displacy.serve(doc, style="span") | displacy.serve(doc, style="span") | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| <Iframe | <Standalone height={100}> | ||||||
|   title="displaCy visualizer for overlapping spans" | <div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#7aecec', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#7aecec', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>ORG</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#feca74', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>GPE</span></span></span>.</div> | ||||||
|   src="/images/displacy-span.html" | </Standalone> | ||||||
|   height={180} |  | ||||||
| /> |  | ||||||
| 
 | 
 | ||||||
| The span visualizer lets you customize the following `options`: | The span visualizer lets you customize the following `options`: | ||||||
| 
 | 
 | ||||||
|  | @ -223,11 +217,9 @@ specify which one displaCy should use with `spans_key` (`sc` is the default). | ||||||
| > displacy.serve(doc, style="span", options=options) | > displacy.serve(doc, style="span", options=options) | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| <Iframe | <Standalone height={100}> | ||||||
|   title="displaCy visualizer for spans (custom spans_key)" | <div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#ddd', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#ddd', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>BANK</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span>.</div> | ||||||
|   src="/images/displacy-span-custom.html" | </Standalone> | ||||||
|   height={225} |  | ||||||
| /> |  | ||||||
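For reference, here is a minimal runnable sketch of the custom `spans_key` workflow rendered above. The span group name `custom` and the blank English pipeline are illustrative assumptions, not part of this diff:

```python
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China.")

# Store the span under a non-default key; "custom" is an illustrative name.
doc.spans["custom"] = [Span(doc, 3, 6, "BANK")]

# Tell displaCy which span group to visualize ("sc" is the default).
options = {"spans_key": "custom"}
displacy.serve(doc, style="span", options=options)
```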
| 
 | 
 | ||||||
| ## Using displaCy in Jupyter notebooks {id="jupyter"} | ## Using displaCy in Jupyter notebooks {id="jupyter"} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -103,6 +103,10 @@ | ||||||
|             "has_examples": true, |             "has_examples": true, | ||||||
|             "models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"] |             "models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"] | ||||||
|         }, |         }, | ||||||
|  |         { | ||||||
|  |             "code": "fo", | ||||||
|  |             "name": "Faroese" | ||||||
|  |         }, | ||||||
|         { |         { | ||||||
|             "code": "fr", |             "code": "fr", | ||||||
|             "name": "French", |             "name": "French", | ||||||
|  | @ -290,6 +294,12 @@ | ||||||
|             "example": "Dit is een zin.", |             "example": "Dit is een zin.", | ||||||
|             "has_examples": true |             "has_examples": true | ||||||
|         }, |         }, | ||||||
|  |         { | ||||||
|  |             "code": "nn", | ||||||
|  |             "name": "Norwegian Nynorsk", | ||||||
|  |             "example": "Det er ein meir enn i same periode i fjor.", | ||||||
|  |             "has_examples": true | ||||||
|  |         }, | ||||||
|         { |         { | ||||||
|             "code": "pl", |             "code": "pl", | ||||||
|             "name": "Polish", |             "name": "Polish", | ||||||
|  |  | ||||||
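The `fo` and `nn` entries above add Faroese and Norwegian Nynorsk to the docs' language table. As a hedged sanity check (this assumes a spaCy version that ships these languages), both codes should work with `spacy.blank`:

```python
import spacy

# Blank pipelines only provide language-specific tokenization;
# no trained model packages are implied by these table entries.
nlp_fo = spacy.blank("fo")  # Faroese
nlp_nn = spacy.blank("nn")  # Norwegian Nynorsk

doc = nlp_nn("Det er ein meir enn i same periode i fjor.")
print([token.text for token in doc])
```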
|  | @ -9,14 +9,9 @@ | ||||||
|                     { "text": "Models & Languages", "url": "/usage/models" }, |                     { "text": "Models & Languages", "url": "/usage/models" }, | ||||||
|                     { "text": "Facts & Figures", "url": "/usage/facts-figures" }, |                     { "text": "Facts & Figures", "url": "/usage/facts-figures" }, | ||||||
|                     { "text": "spaCy 101", "url": "/usage/spacy-101" }, |                     { "text": "spaCy 101", "url": "/usage/spacy-101" }, | ||||||
|                     { "text": "New in v3.0", "url": "/usage/v3" }, |                     { "text": "New in v3.7", "url": "/usage/v3-7" }, | ||||||
|                     { "text": "New in v3.1", "url": "/usage/v3-1" }, |  | ||||||
|                     { "text": "New in v3.2", "url": "/usage/v3-2" }, |  | ||||||
|                     { "text": "New in v3.3", "url": "/usage/v3-3" }, |  | ||||||
|                     { "text": "New in v3.4", "url": "/usage/v3-4" }, |  | ||||||
|                     { "text": "New in v3.5", "url": "/usage/v3-5" }, |  | ||||||
|                     { "text": "New in v3.6", "url": "/usage/v3-6" }, |                     { "text": "New in v3.6", "url": "/usage/v3-6" }, | ||||||
|                     { "text": "New in v3.7", "url": "/usage/v3-7" } |                     { "text": "New in v3.5", "url": "/usage/v3-5" } | ||||||
|                 ] |                 ] | ||||||
|             }, |             }, | ||||||
|             { |             { | ||||||
|  |  | ||||||
|  | @ -66,6 +66,10 @@ | ||||||
|                 { |                 { | ||||||
|                     "text": "Stack Overflow", |                     "text": "Stack Overflow", | ||||||
|                     "url": "http://stackoverflow.com/questions/tagged/spacy" |                     "url": "http://stackoverflow.com/questions/tagged/spacy" | ||||||
|  |                 }, | ||||||
|  |                 { | ||||||
|  |                     "text": "Merchandise", | ||||||
|  |                     "url": "https://explosion.ai/merch" | ||||||
|                 } |                 } | ||||||
|             ] |             ] | ||||||
|         }, |         }, | ||||||
|  |  | ||||||
|  | @ -4500,6 +4500,23 @@ | ||||||
|                 "website": "https://nlp.unibuc.ro/people/snisioi.html" |                 "website": "https://nlp.unibuc.ro/people/snisioi.html" | ||||||
|             }, |             }, | ||||||
|             "category": ["pipeline", "training", "models"] |             "category": ["pipeline", "training", "models"] | ||||||
|  |         }, | ||||||
|  |         { | ||||||
|  |             "id": "redfield-spacy-nodes", | ||||||
|  |             "title": "Redfield NLP Nodes for KNIME", | ||||||
|  |             "slogan": "Makes the functionality of the spaCy library available in the KNIME Analytics Platform.", | ||||||
|  |             "description": "This extension provides nodes that make the functionality of the spaCy library available in the [KNIME Analytics Platform](https://www.knime.com/).", | ||||||
|  |             "github": "Redfield-AB/Spacy-Nodes", | ||||||
|  |             "url": "https://redfield.ai/spacy-redfield/", | ||||||
|  |             "thumb": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/redfield_logo_100x100.png", | ||||||
|  |             "image": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/screen1.png", | ||||||
|  |             "author": "Redfield AB", | ||||||
|  |             "author_links": { | ||||||
|  |                 "twitter": "Redfield_AB", | ||||||
|  |                 "github": "Redfield-AB", | ||||||
|  |                 "website": "https://redfield.ai" | ||||||
|  |             }, | ||||||
|  |             "category": ["standalone"] | ||||||
|         } |         } | ||||||
|     ], |     ], | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,80 +0,0 @@ | ||||||
| <div |  | ||||||
|     class="entities" |  | ||||||
|     style=" |  | ||||||
|         line-height: 2.5; |  | ||||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, |  | ||||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; |  | ||||||
|         font-size: 18px; |  | ||||||
|     " |  | ||||||
|     >But |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: linear-gradient(90deg, #aa9cfc, #fc9ce7); |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|         >Google |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >ORG</span |  | ||||||
|         ></mark |  | ||||||
|     >is starting from behind. The company made a late push into hardware, and |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: linear-gradient(90deg, #aa9cfc, #fc9ce7); |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|         >Apple |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >ORG</span |  | ||||||
|         ></mark |  | ||||||
|     >’s Siri, available on iPhones, and |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: linear-gradient(90deg, #aa9cfc, #fc9ce7); |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|         >Amazon |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >ORG</span |  | ||||||
|         ></mark |  | ||||||
|     >’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer |  | ||||||
|     adoption.</div |  | ||||||
| > |  | ||||||
|  | @ -1,59 +0,0 @@ | ||||||
| <div |  | ||||||
|     class="entities" |  | ||||||
|     style=" |  | ||||||
|         line-height: 2.5; |  | ||||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, |  | ||||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; |  | ||||||
|         font-size: 16px; |  | ||||||
|     " |  | ||||||
| > |  | ||||||
|     🌱🌿 |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: #3dff74; |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|         >🐍 |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >SNEK</span |  | ||||||
|         ></mark |  | ||||||
|     > |  | ||||||
|     ____ 🌳🌲 ____ |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: #cfc5ff; |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|         >👨‍🌾 |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >HUMAN</span |  | ||||||
|         ></mark |  | ||||||
|     > |  | ||||||
|     🏘️ |  | ||||||
| </div> |  | ||||||
|  | @ -1,84 +0,0 @@ | ||||||
| <div |  | ||||||
|     class="entities" |  | ||||||
|     style=" |  | ||||||
|         line-height: 2.5; |  | ||||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, |  | ||||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; |  | ||||||
|         font-size: 16px; |  | ||||||
|     " |  | ||||||
| > |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: #7aecec; |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|     > |  | ||||||
|         Apple |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >ORG</span |  | ||||||
|         > |  | ||||||
|     </mark> |  | ||||||
|     is looking at buying |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: #feca74; |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|     > |  | ||||||
|         U.K. |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >GPE</span |  | ||||||
|         > |  | ||||||
|     </mark> |  | ||||||
|     startup for |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: #e4e7d2; |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|     > |  | ||||||
|         $1 billion |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >MONEY</span |  | ||||||
|         > |  | ||||||
|     </mark> |  | ||||||
| </div> |  | ||||||
|  | @ -1,86 +0,0 @@ | ||||||
| <div |  | ||||||
|     class="entities" |  | ||||||
|     style=" |  | ||||||
|         line-height: 2.5; |  | ||||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, |  | ||||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; |  | ||||||
|         font-size: 18px; |  | ||||||
|     " |  | ||||||
| > |  | ||||||
|     When |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: #aa9cfc; |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|     > |  | ||||||
|         Sebastian Thrun |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >PERSON</span |  | ||||||
|         > |  | ||||||
|     </mark> |  | ||||||
|     started working on self-driving cars at |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: #7aecec; |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|     > |  | ||||||
|         Google |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >ORG</span |  | ||||||
|         > |  | ||||||
|     </mark> |  | ||||||
|     in |  | ||||||
|     <mark |  | ||||||
|         class="entity" |  | ||||||
|         style=" |  | ||||||
|             background: #bfe1d9; |  | ||||||
|             padding: 0.45em 0.6em; |  | ||||||
|             margin: 0 0.25em; |  | ||||||
|             line-height: 1; |  | ||||||
|             border-radius: 0.35em; |  | ||||||
|         " |  | ||||||
|     > |  | ||||||
|         2007 |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 font-size: 0.8em; |  | ||||||
|                 font-weight: bold; |  | ||||||
|                 line-height: 1; |  | ||||||
|                 border-radius: 0.35em; |  | ||||||
|                 text-transform: uppercase; |  | ||||||
|                 vertical-align: middle; |  | ||||||
|                 margin-left: 0.5rem; |  | ||||||
|             " |  | ||||||
|             >DATE</span |  | ||||||
|         > |  | ||||||
|     </mark> |  | ||||||
|     , few people outside of the company took him seriously. |  | ||||||
| </div> |  | ||||||
website/public/images/displacy-long2.svg (new file, 212 lines)
						|  | @ -0,0 +1,212 @@ | ||||||
|  | <svg | ||||||
|  |     xmlns="http://www.w3.org/2000/svg" | ||||||
|  |     xmlns:xlink="http://www.w3.org/1999/xlink" | ||||||
|  |     id="0" | ||||||
|  |     class="displacy" | ||||||
|  |     width="1275" | ||||||
|  |     height="399.5" | ||||||
|  |     style=" | ||||||
|  |         max-width: none; | ||||||
|  |         height: 399.5px; | ||||||
|  |         color: #000000; | ||||||
|  |         background: #ffffff; | ||||||
|  |         font-family: Arial; | ||||||
|  |     " | ||||||
|  | > | ||||||
|  |     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||||
|  |         <tspan class="displacy-word" fill="currentColor" x="50">Autonomous</tspan> | ||||||
|  |         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">ADJ</tspan> | ||||||
|  |     </text> | ||||||
|  | 
 | ||||||
|  |     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||||
|  |         <tspan class="displacy-word" fill="currentColor" x="225">cars</tspan> | ||||||
|  |         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">NOUN</tspan> | ||||||
|  |     </text> | ||||||
|  | 
 | ||||||
|  |     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||||
|  |         <tspan class="displacy-word" fill="currentColor" x="400">shift</tspan> | ||||||
|  |         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="400">VERB</tspan> | ||||||
|  |     </text> | ||||||
|  | 
 | ||||||
|  |     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||||
|  |         <tspan class="displacy-word" fill="currentColor" x="575">insurance</tspan> | ||||||
|  |         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="575">NOUN</tspan> | ||||||
|  |     </text> | ||||||
|  | 
 | ||||||
|  |     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||||
|  |         <tspan class="displacy-word" fill="currentColor" x="750">liability</tspan> | ||||||
|  |         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="750">NOUN</tspan> | ||||||
|  |     </text> | ||||||
|  | 
 | ||||||
|  |     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||||
|  |         <tspan class="displacy-word" fill="currentColor" x="925">toward</tspan> | ||||||
|  |         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="925">ADP</tspan> | ||||||
|  |     </text> | ||||||
|  | 
 | ||||||
|  |     <text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5"> | ||||||
|  |         <tspan class="displacy-word" fill="currentColor" x="1100">manufacturers</tspan> | ||||||
|  |         <tspan class="displacy-tag" dy="2em" fill="currentColor" x="1100">NOUN</tspan> | ||||||
|  |     </text> | ||||||
|  | 
 | ||||||
|  |     <g class="displacy-arrow"> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arc" | ||||||
|  |             id="arrow-0-0" | ||||||
|  |             stroke-width="2px" | ||||||
|  |             d="M70,264.5 C70,177.0 215.0,177.0 215.0,264.5" | ||||||
|  |             fill="none" | ||||||
|  |             stroke="currentColor" | ||||||
|  |         ></path> | ||||||
|  |         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||||
|  |             <textpath | ||||||
|  |                 xlink:href="#arrow-0-0" | ||||||
|  |                 class="displacy-label" | ||||||
|  |                 startOffset="50%" | ||||||
|  |                 fill="currentColor" | ||||||
|  |                 text-anchor="middle" | ||||||
|  |             > | ||||||
|  |                 amod | ||||||
|  |             </textpath> | ||||||
|  |         </text> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arrowhead" | ||||||
|  |             d="M70,266.5 L62,254.5 78,254.5" | ||||||
|  |             fill="currentColor" | ||||||
|  |         ></path> | ||||||
|  |     </g> | ||||||
|  | 
 | ||||||
|  |     <g class="displacy-arrow"> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arc" | ||||||
|  |             id="arrow-0-1" | ||||||
|  |             stroke-width="2px" | ||||||
|  |             d="M245,264.5 C245,177.0 390.0,177.0 390.0,264.5" | ||||||
|  |             fill="none" | ||||||
|  |             stroke="currentColor" | ||||||
|  |         ></path> | ||||||
|  |         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||||
|  |             <textpath | ||||||
|  |                 xlink:href="#arrow-0-1" | ||||||
|  |                 class="displacy-label" | ||||||
|  |                 startOffset="50%" | ||||||
|  |                 fill="currentColor" | ||||||
|  |                 text-anchor="middle" | ||||||
|  |             > | ||||||
|  |                 nsubj | ||||||
|  |             </textpath> | ||||||
|  |         </text> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arrowhead" | ||||||
|  |             d="M245,266.5 L237,254.5 253,254.5" | ||||||
|  |             fill="currentColor" | ||||||
|  |         ></path> | ||||||
|  |     </g> | ||||||
|  | 
 | ||||||
|  |     <g class="displacy-arrow"> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arc" | ||||||
|  |             id="arrow-0-2" | ||||||
|  |             stroke-width="2px" | ||||||
|  |             d="M595,264.5 C595,177.0 740.0,177.0 740.0,264.5" | ||||||
|  |             fill="none" | ||||||
|  |             stroke="currentColor" | ||||||
|  |         ></path> | ||||||
|  |         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||||
|  |             <textpath | ||||||
|  |                 xlink:href="#arrow-0-2" | ||||||
|  |                 class="displacy-label" | ||||||
|  |                 startOffset="50%" | ||||||
|  |                 fill="currentColor" | ||||||
|  |                 text-anchor="middle" | ||||||
|  |             > | ||||||
|  |                 compound | ||||||
|  |             </textpath> | ||||||
|  |         </text> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arrowhead" | ||||||
|  |             d="M595,266.5 L587,254.5 603,254.5" | ||||||
|  |             fill="currentColor" | ||||||
|  |         ></path> | ||||||
|  |     </g> | ||||||
|  | 
 | ||||||
|  |     <g class="displacy-arrow"> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arc" | ||||||
|  |             id="arrow-0-3" | ||||||
|  |             stroke-width="2px" | ||||||
|  |             d="M420,264.5 C420,89.5 745.0,89.5 745.0,264.5" | ||||||
|  |             fill="none" | ||||||
|  |             stroke="currentColor" | ||||||
|  |         ></path> | ||||||
|  |         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||||
|  |             <textpath | ||||||
|  |                 xlink:href="#arrow-0-3" | ||||||
|  |                 class="displacy-label" | ||||||
|  |                 startOffset="50%" | ||||||
|  |                 fill="currentColor" | ||||||
|  |                 text-anchor="middle" | ||||||
|  |             > | ||||||
|  |                 dobj | ||||||
|  |             </textpath> | ||||||
|  |         </text> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arrowhead" | ||||||
|  |             d="M745.0,266.5 L753.0,254.5 737.0,254.5" | ||||||
|  |             fill="currentColor" | ||||||
|  |         ></path> | ||||||
|  |     </g> | ||||||
|  | 
 | ||||||
|  |     <g class="displacy-arrow"> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arc" | ||||||
|  |             id="arrow-0-4" | ||||||
|  |             stroke-width="2px" | ||||||
|  |             d="M420,264.5 C420,2.0 925.0,2.0 925.0,264.5" | ||||||
|  |             fill="none" | ||||||
|  |             stroke="currentColor" | ||||||
|  |         ></path> | ||||||
|  |         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||||
|  |             <textpath | ||||||
|  |                 xlink:href="#arrow-0-4" | ||||||
|  |                 class="displacy-label" | ||||||
|  |                 startOffset="50%" | ||||||
|  |                 fill="currentColor" | ||||||
|  |                 text-anchor="middle" | ||||||
|  |             > | ||||||
|  |                 prep | ||||||
|  |             </textpath> | ||||||
|  |         </text> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arrowhead" | ||||||
|  |             d="M925.0,266.5 L933.0,254.5 917.0,254.5" | ||||||
|  |             fill="currentColor" | ||||||
|  |         ></path> | ||||||
|  |     </g> | ||||||
|  | 
 | ||||||
|  |     <g class="displacy-arrow"> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arc" | ||||||
|  |             id="arrow-0-5" | ||||||
|  |             stroke-width="2px" | ||||||
|  |             d="M945,264.5 C945,177.0 1090.0,177.0 1090.0,264.5" | ||||||
|  |             fill="none" | ||||||
|  |             stroke="currentColor" | ||||||
|  |         ></path> | ||||||
|  |         <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px"> | ||||||
|  |             <textpath | ||||||
|  |                 xlink:href="#arrow-0-5" | ||||||
|  |                 class="displacy-label" | ||||||
|  |                 startOffset="50%" | ||||||
|  |                 fill="currentColor" | ||||||
|  |                 text-anchor="middle" | ||||||
|  |             > | ||||||
|  |                 pobj | ||||||
|  |             </textpath> | ||||||
|  |         </text> | ||||||
|  |         <path | ||||||
|  |             class="displacy-arrowhead" | ||||||
|  |             d="M1090.0,266.5 L1098.0,254.5 1082.0,254.5" | ||||||
|  |             fill="currentColor" | ||||||
|  |         ></path> | ||||||
|  |     </g> | ||||||
|  | </svg> | ||||||
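The new `displacy-long2.svg` above is a static render of a displaCy dependency parse. A minimal sketch of how such an asset can be regenerated (assuming the `en_core_web_sm` model is installed; the exact markup may differ between spaCy versions):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# With jupyter=False, render() returns the SVG markup as a string.
svg = displacy.render(doc, style="dep", jupyter=False)
with open("displacy-long2.svg", "w", encoding="utf-8") as f:
    f.write(svg)
```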
|  | @ -1,84 +0,0 @@ | ||||||
| <div |  | ||||||
|     class="spans" |  | ||||||
|     style=" |  | ||||||
|         line-height: 2.5; |  | ||||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, |  | ||||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; |  | ||||||
|         font-size: 18px; |  | ||||||
|         direction: ltr; |  | ||||||
|     " |  | ||||||
| > |  | ||||||
|     Welcome to the |  | ||||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> |  | ||||||
|         Bank |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 background: #ddd; |  | ||||||
|                 top: 40px; |  | ||||||
|                 height: 4px; |  | ||||||
|                 left: -1px; |  | ||||||
|                 width: calc(100% + 2px); |  | ||||||
|                 position: absolute; |  | ||||||
|             " |  | ||||||
|         > |  | ||||||
|         </span> |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 background: #ddd; |  | ||||||
|                 top: 40px; |  | ||||||
|                 height: 4px; |  | ||||||
|                 border-top-left-radius: 3px; |  | ||||||
|                 border-bottom-left-radius: 3px; |  | ||||||
|                 left: -1px; |  | ||||||
|                 width: calc(100% + 2px); |  | ||||||
|                 position: absolute; |  | ||||||
|             " |  | ||||||
|         > |  | ||||||
|             <span |  | ||||||
|                 style=" |  | ||||||
|                     background: #ddd; |  | ||||||
|                     color: #000; |  | ||||||
|                     top: -0.5em; |  | ||||||
|                     padding: 2px 3px; |  | ||||||
|                     position: absolute; |  | ||||||
|                     font-size: 0.6em; |  | ||||||
|                     font-weight: bold; |  | ||||||
|                     line-height: 1; |  | ||||||
|                     border-radius: 3px; |  | ||||||
|                 " |  | ||||||
|             > |  | ||||||
|                 BANK |  | ||||||
|             </span> |  | ||||||
|         </span> |  | ||||||
|     </span> |  | ||||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> |  | ||||||
|         of |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 background: #ddd; |  | ||||||
|                 top: 40px; |  | ||||||
|                 height: 4px; |  | ||||||
|                 left: -1px; |  | ||||||
|                 width: calc(100% + 2px); |  | ||||||
|                 position: absolute; |  | ||||||
|             " |  | ||||||
|         > |  | ||||||
|         </span> |  | ||||||
|     </span> |  | ||||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> |  | ||||||
|         China |  | ||||||
| 
 |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 background: #ddd; |  | ||||||
|                 top: 40px; |  | ||||||
|                 height: 4px; |  | ||||||
|                 left: -1px; |  | ||||||
|                 width: calc(100% + 2px); |  | ||||||
|                 position: absolute; |  | ||||||
|             " |  | ||||||
|         > |  | ||||||
|         </span> |  | ||||||
|     </span> |  | ||||||
|     . |  | ||||||
| </div> |  | ||||||
|  | @ -1,123 +0,0 @@ | ||||||
| <div |  | ||||||
|     class="spans" |  | ||||||
|     style=" |  | ||||||
|         line-height: 2.5; |  | ||||||
|         direction: ltr; |  | ||||||
|         font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, |  | ||||||
|             'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; |  | ||||||
|         font-size: 18px; |  | ||||||
|     " |  | ||||||
| > |  | ||||||
|     Welcome to the |  | ||||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> |  | ||||||
|         Bank |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 background: #7aecec; |  | ||||||
|                 top: 40px; |  | ||||||
|                 height: 4px; |  | ||||||
|                 left: -1px; |  | ||||||
|                 width: calc(100% + 2px); |  | ||||||
|                 position: absolute; |  | ||||||
|             " |  | ||||||
|         > |  | ||||||
|         </span> |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 background: #7aecec; |  | ||||||
|                 top: 40px; |  | ||||||
|                 height: 4px; |  | ||||||
|                 border-top-left-radius: 3px; |  | ||||||
|                 border-bottom-left-radius: 3px; |  | ||||||
|                 left: -1px; |  | ||||||
|                 width: calc(100% + 2px); |  | ||||||
|                 position: absolute; |  | ||||||
|             " |  | ||||||
|         > |  | ||||||
|             <span |  | ||||||
|                 style=" |  | ||||||
|                     background: #7aecec; |  | ||||||
|                     color: #000; |  | ||||||
|                     top: -0.5em; |  | ||||||
|                     padding: 2px 3px; |  | ||||||
|                     position: absolute; |  | ||||||
|                     font-size: 0.6em; |  | ||||||
|                     font-weight: bold; |  | ||||||
|                     line-height: 1; |  | ||||||
|                     border-radius: 3px; |  | ||||||
|                 " |  | ||||||
|             > |  | ||||||
|                 ORG |  | ||||||
|             </span> |  | ||||||
|         </span> |  | ||||||
|     </span> |  | ||||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> |  | ||||||
|         of |  | ||||||
| 
 |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 background: #7aecec; |  | ||||||
|                 top: 40px; |  | ||||||
|                 height: 4px; |  | ||||||
|                 left: -1px; |  | ||||||
|                 width: calc(100% + 2px); |  | ||||||
|                 position: absolute; |  | ||||||
|             " |  | ||||||
|         > |  | ||||||
|         </span> |  | ||||||
|     </span> |  | ||||||
|     <span style="font-weight: bold; display: inline-block; position: relative"> |  | ||||||
|         China |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 background: #7aecec; |  | ||||||
|                 top: 40px; |  | ||||||
|                 height: 4px; |  | ||||||
|                 left: -1px; |  | ||||||
|                 width: calc(100% + 2px); |  | ||||||
|                 position: absolute; |  | ||||||
|             " |  | ||||||
|         > |  | ||||||
|         </span> |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 background: #feca74; |  | ||||||
|                 top: 57px; |  | ||||||
|                 height: 4px; |  | ||||||
|                 left: -1px; |  | ||||||
|                 width: calc(100% + 2px); |  | ||||||
|                 position: absolute; |  | ||||||
|             " |  | ||||||
|         > |  | ||||||
|         </span> |  | ||||||
|         <span |  | ||||||
|             style=" |  | ||||||
|                 background: #feca74; |  | ||||||
|                 top: 57px; |  | ||||||
|                 height: 4px; |  | ||||||
|                 border-top-left-radius: 3px; |  | ||||||
|                 border-bottom-left-radius: 3px; |  | ||||||
|                 left: -1px; |  | ||||||
|                 width: calc(100% + 2px); |  | ||||||
|                 position: absolute; |  | ||||||
|             " |  | ||||||
|         > |  | ||||||
|             <span |  | ||||||
|                 style=" |  | ||||||
|                     background: #feca74; |  | ||||||
|                     color: #000; |  | ||||||
|                     top: -0.5em; |  | ||||||
|                     padding: 2px 3px; |  | ||||||
|                     position: absolute; |  | ||||||
|                     font-size: 0.6em; |  | ||||||
|                     font-weight: bold; |  | ||||||
|                     line-height: 1; |  | ||||||
|                     border-radius: 3px; |  | ||||||
|                 " |  | ||||||
|             > |  | ||||||
|                 GPE |  | ||||||
|             </span> |  | ||||||
|         </span> |  | ||||||
|     </span> |  | ||||||
|     . |  | ||||||
| </div> |  | ||||||
|  | @ -107,6 +107,22 @@ const Image = ({ src, alt, title, href, ...props }) => { | ||||||
|     ) |     ) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | const ImageScrollable = ({ src, alt, width, ...props }) => { | ||||||
|  |     return ( | ||||||
|  |         <figure className={classNames(classes.standalone, classes.scrollable)}> | ||||||
|  |             <img className={classes['image-scrollable']} src={src} alt={alt} width={width} height="auto" /> | ||||||
|  |         </figure> | ||||||
|  |     ) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | const Standalone = ({ height, children, ...props }) => { | ||||||
|  |     return ( | ||||||
|  |         <figure className={classes.standalone} style={{ height }}> | ||||||
|  |             {children} | ||||||
|  |         </figure> | ||||||
|  |     ) | ||||||
|  | } | ||||||
|  | 
 | ||||||
| const ImageFill = ({ image, ...props }) => { | const ImageFill = ({ image, ...props }) => { | ||||||
|     return ( |     return ( | ||||||
|         <span |         <span | ||||||
|  | @ -137,4 +153,4 @@ const GoogleSheet = ({ id, link, height, button = 'View full table' }) => { | ||||||
|     ) |     ) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| export { YouTube, SoundCloud, Iframe, Image, ImageFill, GoogleSheet } | export { YouTube, SoundCloud, Iframe, Image, ImageFill, ImageScrollable, GoogleSheet, Standalone } | ||||||
|  |  | ||||||
|  | @ -13,7 +13,7 @@ import Aside from './components/aside' | ||||||
| import Button from './components/button' | import Button from './components/button' | ||||||
| import Tag from './components/tag' | import Tag from './components/tag' | ||||||
| import Grid from './components/grid' | import Grid from './components/grid' | ||||||
| import { YouTube, SoundCloud, Iframe, Image, GoogleSheet } from './components/embed' | import { YouTube, SoundCloud, Iframe, Image, ImageScrollable, GoogleSheet, Standalone } from './components/embed' | ||||||
| import Project from './widgets/project' | import Project from './widgets/project' | ||||||
| import { Integration, IntegrationLogo } from './widgets/integration.js' | import { Integration, IntegrationLogo } from './widgets/integration.js' | ||||||
| import { Logos, Colors, Patterns } from './widgets/styleguide' | import { Logos, Colors, Patterns } from './widgets/styleguide' | ||||||
|  | @ -90,6 +90,8 @@ export const remarkComponents = { | ||||||
|      * For regular img elements it is not possible to pass properties |      * For regular img elements it is not possible to pass properties | ||||||
|      */ |      */ | ||||||
|     Image, |     Image, | ||||||
|  |     ImageScrollable, | ||||||
|  |     Standalone, | ||||||
| 
 | 
 | ||||||
|     Label, |     Label, | ||||||
|     Logos, |     Logos, | ||||||
|  |  | ||||||