Merge pull request #13286 from explosion/master

Sync `docs/llm_main` with `master`
Raphael Mitsch 2024-01-29 15:32:29 +01:00 committed by GitHub
commit c38fdbe09b
102 changed files with 1965 additions and 829 deletions

.github/FUNDING.yml

@@ -0,0 +1 @@
custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions]

@@ -58,7 +58,7 @@ jobs:
 fail-fast: true
 matrix:
 os: [ubuntu-latest, windows-latest, macos-latest]
-python_version: ["3.11", "3.12.0-rc.2"]
+python_version: ["3.12"]
 include:
 - os: windows-latest
 python_version: "3.7"
@@ -68,6 +68,8 @@ jobs:
 python_version: "3.9"
 - os: windows-latest
 python_version: "3.10"
+- os: macos-latest
+python_version: "3.11"
 runs-on: ${{ matrix.os }}

@@ -1,6 +1,6 @@
 The MIT License (MIT)
-Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

@@ -39,28 +39,35 @@ open-source software, released under the
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
+| ⏩ **[GPU Processing]** | Use spaCy with CUDA-compatible GPU processing. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
+| 🦙 **[Large Language Models]** | Integrate LLMs into spaCy pipelines. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
 | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
+| 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
-| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-pipelines)** |
-| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/spacy-tailored-analysis)** |
+| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
+| <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy's core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and we'll be in touch! **[Learn more &rarr;](https://explosion.ai/tailored-solutions)** |
 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
 [usage guides]: https://spacy.io/usage/
 [api reference]: https://spacy.io/api/
+[gpu processing]: https://spacy.io/usage#gpu
 [models]: https://spacy.io/models
+[large language models]: https://spacy.io/usage/large-language-models
 [universe]: https://spacy.io/universe
 [spacy vs code extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
+[blog]: https://explosion.ai
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
+[swag]: https://explosion.ai/merch
 ## 💬 Where to ask questions

@@ -158,3 +158,45 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
SciPy
-----
* Files: scorer.py
The implementation of trapezoid() is adapted from SciPy, which is distributed
under the following license:
New BSD License
Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@@ -5,7 +5,7 @@ requires = [
 "cymem>=2.0.2,<2.1.0",
 "preshed>=3.0.2,<3.1.0",
 "murmurhash>=0.28.0,<1.1.0",
-"thinc>=8.1.8,<8.3.0",
+"thinc>=8.2.2,<8.3.0",
 "numpy>=1.15.0; python_version < '3.9'",
 "numpy>=1.25.0; python_version >= '3.9'",
 ]

@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.1.8,<8.3.0
+thinc>=8.2.2,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0

@@ -41,7 +41,7 @@ setup_requires =
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 murmurhash>=0.28.0,<1.1.0
-thinc>=8.1.8,<8.3.0
+thinc>=8.2.2,<8.3.0
 install_requires =
 # Our libraries
 spacy-legacy>=3.0.11,<3.1.0
@@ -49,7 +49,7 @@ install_requires =
 murmurhash>=0.28.0,<1.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.1.8,<8.3.0
+thinc>=8.2.2,<8.3.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0

@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.1"
+__version__ = "3.7.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -22,8 +22,17 @@ from .init_pipeline import init_pipeline_cli # noqa: F401
 from .package import package # noqa: F401
 from .pretrain import pretrain # noqa: F401
 from .profile import profile # noqa: F401
-from .train import train_cli # noqa: F401
-from .validate import validate # noqa: F401
+from .project.assets import project_assets # type: ignore[attr-defined] # noqa: F401
+from .project.clone import project_clone # type: ignore[attr-defined] # noqa: F401
+from .project.document import ( # type: ignore[attr-defined] # noqa: F401
+project_document,
+)
+from .project.dvc import project_update_dvc # type: ignore[attr-defined] # noqa: F401
+from .project.pull import project_pull # type: ignore[attr-defined] # noqa: F401
+from .project.push import project_push # type: ignore[attr-defined] # noqa: F401
+from .project.run import project_run # type: ignore[attr-defined] # noqa: F401
+from .train import train_cli # type: ignore[attr-defined] # noqa: F401
+from .validate import validate # type: ignore[attr-defined] # noqa: F401
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

@@ -13,7 +13,7 @@ from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, setup_gpu
+from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu
 @benchmark_cli.command(
@@ -30,12 +30,14 @@ def benchmark_speed_cli(
 use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
 n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
 warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 # fmt: on
 ):
 """
 Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
 data in the binary .spacy format.
 """
+import_code(code_path)
 setup_gpu(use_gpu=use_gpu, silent=False)
 nlp = util.load_model(model)
@@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
 def warmup(
 nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 ) -> numpy.ndarray:
-docs = warmup_epochs * docs
+docs = [doc.copy() for doc in docs * warmup_epochs]
 return annotate(nlp, docs, batch_size)
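A note on the warmup change above: `warmup_epochs * docs` repeats references to the same `Doc` objects, which the pipeline then annotates in place, so later warmup passes no longer see fresh input. Copying each `Doc` avoids that. A minimal sketch of the difference (the blank pipeline and texts here are illustrative assumptions, not part of the diff):

import spacy

nlp = spacy.blank("en")  # illustrative pipeline
docs = list(nlp.pipe(["one text", "another text"]))

# Old behaviour: the same Doc objects, repeated by reference.
repeated = 3 * docs
assert repeated[0] is repeated[2]

# New behaviour: independent copies, so in-place annotation during
# warmup never touches the same object twice.
copied = [doc.copy() for doc in docs * 3]
assert copied[0] is not copied[2]
assert copied[0].text == copied[2].text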

@@ -7,7 +7,14 @@ from wasabi import msg
 from .. import about
 from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ..util import (
+get_minor_version,
+is_in_interactive,
+is_in_jupyter,
+is_package,
+is_prerelease_version,
+run_command,
+)
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
@@ -77,6 +84,27 @@ def download(
 "Download and installation successful",
 f"You can now load the package via spacy.load('{model_name}')",
 )
+if is_in_jupyter():
+reload_deps_msg = (
+"If you are in a Jupyter or Colab notebook, you may need to "
+"restart Python in order to load all the package's dependencies. "
+"You can do this by selecting the 'Restart kernel' or 'Restart "
+"runtime' option."
+)
+msg.warn(
+"Restart to reload dependencies",
+reload_deps_msg,
+)
+elif is_in_interactive():
+reload_deps_msg = (
+"If you are in an interactive Python session, you may need to "
+"exit and restart Python to load all the package's dependencies. "
+"You can exit with Ctrl-D (or Ctrl-Z and Enter on Windows)."
+)
+msg.warn(
+"Restart to reload dependencies",
+reload_deps_msg,
+)
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
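For context on the two checks used above: `is_in_jupyter` already existed in `spacy.util`, and `is_in_interactive` is introduced alongside this change. The sketch below shows how such environment checks are commonly written; it is an assumption-based illustration, not spaCy's actual implementation:

import sys

def is_in_jupyter_sketch() -> bool:
    # A Jupyter/Colab kernel exposes get_ipython() with a ZMQ-based shell.
    try:
        shell = get_ipython().__class__.__name__  # noqa: F821 (defined by IPython)
        return shell == "ZMQInteractiveShell"
    except NameError:
        return False

def is_in_interactive_sketch() -> bool:
    # The plain Python REPL sets sys.ps1; `python -i` sets the interactive flag.
    return hasattr(sys, "ps1") or bool(sys.flags.interactive)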

@@ -1,5 +1,7 @@
+import os
 import re
 import shutil
+import subprocess
 import sys
 from collections import defaultdict
 from pathlib import Path
@@ -11,6 +13,7 @@ from thinc.api import Config
 from wasabi import MarkdownRenderer, Printer, get_raw_input
 from .. import about, util
+from ..compat import importlib_metadata
 from ..schemas import ModelMetaSchema, validate
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@@ -35,7 +38,7 @@ def package_cli(
 specified output directory, and the data will be copied over. If
 --create-meta is set and a meta.json already exists in the output directory,
 the existing values will be used as the defaults in the command-line prompt.
-After packaging, "python setup.py sdist" is run in the package directory,
+After packaging, "python -m build --sdist" is run in the package directory,
 which will create a .tar.gz archive that can be installed via "pip install".
 If additional code files are provided (e.g. Python files containing custom
@@ -78,9 +81,17 @@ def package(
 input_path = util.ensure_path(input_dir)
 output_path = util.ensure_path(output_dir)
 meta_path = util.ensure_path(meta_path)
-if create_wheel and not has_wheel():
-err = "Generating a binary .whl file requires wheel to be installed"
-msg.fail(err, "pip install wheel", exits=1)
+if create_wheel and not has_wheel() and not has_build():
+err = (
+"Generating wheels requires 'build' or 'wheel' (deprecated) to be installed"
+)
+msg.fail(err, "pip install build", exits=1)
+if not has_build():
+msg.warn(
+"Generating packages without the 'build' package is deprecated and "
+"will not be supported in the future. To install 'build': pip "
+"install build"
+)
 if not input_path or not input_path.exists():
 msg.fail("Can't locate pipeline data", input_path, exits=1)
 if not output_path or not output_path.exists():
@@ -184,12 +195,37 @@ def package(
 msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
 if create_sdist:
 with util.working_dir(main_path):
-util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
+# run directly, since util.run_command is not designed to continue
+# after a command fails
+ret = subprocess.run(
+[sys.executable, "-m", "build", ".", "--sdist"],
+env=os.environ.copy(),
+)
+if ret.returncode != 0:
+msg.warn(
+"Creating sdist with 'python -m build' failed. Falling "
+"back to deprecated use of 'python setup.py sdist'"
+)
+util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
 zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
 msg.good(f"Successfully created zipped Python package", zip_file)
 if create_wheel:
 with util.working_dir(main_path):
-util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
+# run directly, since util.run_command is not designed to continue
+# after a command fails
+ret = subprocess.run(
+[sys.executable, "-m", "build", ".", "--wheel"],
+env=os.environ.copy(),
+)
+if ret.returncode != 0:
+msg.warn(
+"Creating wheel with 'python -m build' failed. Falling "
+"back to deprecated use of 'wheel' with "
+"'python setup.py bdist_wheel'"
+)
+util.run_command(
+[sys.executable, "setup.py", "bdist_wheel"], capture=False
+)
 wheel_name_squashed = re.sub("_+", "_", model_name_v)
 wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
 msg.good(f"Successfully created binary wheel", wheel)
@@ -209,6 +245,17 @@ def has_wheel() -> bool:
 return False
+def has_build() -> bool:
+# it's very likely that there is a local directory named build/ (especially
+# in an editable install), so an import check is not sufficient; instead
+# check that there is a package version
+try:
+importlib_metadata.version("build")
+return True
+except importlib_metadata.PackageNotFoundError: # type: ignore[attr-defined]
+return False
 def get_third_party_dependencies(
 config: Config, exclude: List[str] = util.SimpleFrozenList()
 ) -> List[str]:
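The has_build() check above deliberately queries package metadata instead of attempting an import, because a local build/ directory (common in editable installs) would make `import build` succeed spuriously. A standalone sketch of the same check using only the stdlib (an illustration, not the diff's exact code):

import importlib.metadata

def dist_installed(name: str) -> bool:
    # Query installed distribution metadata; a same-named local directory
    # on sys.path cannot fake this, unlike an import check.
    try:
        importlib.metadata.version(name)
        return True
    except importlib.metadata.PackageNotFoundError:
        return False

print(dist_installed("build"))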


@@ -0,0 +1 @@
from weasel.cli.assets import *

@@ -0,0 +1 @@
from weasel.cli.clone import *

@@ -0,0 +1 @@
from weasel.cli.document import *

spacy/cli/project/dvc.py
@@ -0,0 +1 @@
from weasel.cli.dvc import *

@@ -0,0 +1 @@
from weasel.cli.pull import *

@@ -0,0 +1 @@
from weasel.cli.push import *

@@ -0,0 +1 @@
from weasel.cli.remote_storage import *

spacy/cli/project/run.py
@@ -0,0 +1 @@
from weasel.cli.run import *

@@ -271,8 +271,9 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false
@@ -308,8 +309,9 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false
@@ -542,14 +544,15 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
@@ -570,15 +573,17 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false
 {%- endif %}

@@ -142,7 +142,25 @@ class SpanRenderer:
 spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
 title (str / None): Document title set in Doc.user_data['title'].
 """
-per_token_info = []
+per_token_info = self._assemble_per_token_info(tokens, spans)
+markup = self._render_markup(per_token_info)
+markup = TPL_SPANS.format(content=markup, dir=self.direction)
+if title:
+markup = TPL_TITLE.format(title=title) + markup
+return markup
+@staticmethod
+def _assemble_per_token_info(
+tokens: List[str], spans: List[Dict[str, Any]]
+) -> List[Dict[str, List[Dict[str, Any]]]]:
+"""Assembles token info used to generate markup in render_spans().
+tokens (List[str]): Tokens in text.
+spans (List[Dict[str, Any]]): Spans in text.
+RETURNS (List[Dict[str, List[Dict, str, Any]]]): Per token info needed to render HTML markup for given tokens
+and spans.
+"""
+per_token_info: List[Dict[str, List[Dict[str, Any]]]] = []
 # we must sort so that we can correctly describe when spans need to "stack"
 # which is determined by their start token, then span length (longer spans on top),
 # then break any remaining ties with the span label
@@ -154,21 +172,22 @@ class SpanRenderer:
 s["label"],
 ),
 )
 for s in spans:
 # this is the vertical 'slot' that the span will be rendered in
 # vertical_position = span_label_offset + (offset_step * (slot - 1))
 s["render_slot"] = 0
 for idx, token in enumerate(tokens):
 # Identify if a token belongs to a Span (and which) and if it's a
 # start token of said Span. We'll use this for the final HTML render
 token_markup: Dict[str, Any] = {}
 token_markup["text"] = token
-concurrent_spans = 0
+intersecting_spans: List[Dict[str, Any]] = []
 entities = []
 for span in spans:
 ent = {}
 if span["start_token"] <= idx < span["end_token"]:
-concurrent_spans += 1
 span_start = idx == span["start_token"]
 ent["label"] = span["label"]
 ent["is_start"] = span_start
@@ -176,7 +195,12 @@ class SpanRenderer:
 # When the span starts, we need to know how many other
 # spans are on the 'span stack' and will be rendered.
 # This value becomes the vertical render slot for this entire span
-span["render_slot"] = concurrent_spans
+span["render_slot"] = (
+intersecting_spans[-1]["render_slot"]
+if len(intersecting_spans)
+else 0
+) + 1
+intersecting_spans.append(span)
 ent["render_slot"] = span["render_slot"]
 kb_id = span.get("kb_id", "")
 kb_url = span.get("kb_url", "#")
@@ -193,11 +217,8 @@ class SpanRenderer:
 span["render_slot"] = 0
 token_markup["entities"] = entities
 per_token_info.append(token_markup)
-markup = self._render_markup(per_token_info)
-markup = TPL_SPANS.format(content=markup, dir=self.direction)
-if title:
-markup = TPL_TITLE.format(title=title) + markup
-return markup
+return per_token_info
 def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
 """Render the markup from per-token information"""

@@ -227,7 +227,6 @@ class Errors(metaclass=ErrorsWithCodes):
 E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
 "This usually happens when spaCy calls `nlp.{method}` with a custom "
 "component name that's not registered on the current language class. "
-"If you're using a Transformer, make sure to install 'spacy-transformers'. "
 "If you're using a custom component, make sure you've added the "
 "decorator `@Language.component` (for function components) or "
 "`@Language.factory` (for class components).\n\nAvailable "
@@ -984,6 +983,10 @@ class Errors(metaclass=ErrorsWithCodes):
 "predicted docs when training {component}.")
 E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
 "but only callbacks with one or three parameters are supported")
+E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+"reduction. Please enable one of `use_reduce_first`, "
+"`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 # Deprecated model shortcuts, only used in errors and warnings

@@ -1,3 +1,11 @@
 from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
+__all__ = [
+"Candidate",
+"KnowledgeBase",
+"InMemoryLookupKB",
+"get_candidates",
+"get_candidates_batch",
+]

@@ -6,7 +6,8 @@ _num_words = [
 "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
 "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
 "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
-"million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
+"million", "billion", "trillion", "quadrillion", "quintillion", "sextillion",
+"septillion", "octillion", "nonillion", "decillion", "gajillion", "bazillion"
 ]
 _ordinal_words = [
 "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
@@ -14,7 +15,8 @@ _ordinal_words = [
 "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
 "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
 "eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth",
-"trillionth", "quadrillionth", "gajillionth", "bazillionth"
+"trillionth", "quadrillionth", "quintillionth", "sextillionth", "septillionth",
+"octillionth", "nonillionth", "decillionth", "gajillionth", "bazillionth"
 ]
 # fmt: on

spacy/lang/fo/__init__.py
@@ -0,0 +1,18 @@
from ...language import BaseDefaults, Language
from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class FaroeseDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
prefixes = TOKENIZER_PREFIXES
class Faroese(Language):
lang = "fo"
Defaults = FaroeseDefaults
__all__ = ["Faroese"]

@@ -0,0 +1,90 @@
from ...symbols import ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
for orth in [
"apr.",
"aug.",
"avgr.",
"árg.",
"ávís.",
"beinl.",
"blkv.",
"blaðkv.",
"blm.",
"blaðm.",
"bls.",
"blstj.",
"blaðstj.",
"des.",
"eint.",
"febr.",
"fyrrv.",
"góðk.",
"h.m.",
"innt.",
"jan.",
"kl.",
"m.a.",
"mðr.",
"mió.",
"nr.",
"nto.",
"nov.",
"nút.",
"o.a.",
"o.a.m.",
"o.a.tíl.",
"o.fl.",
"ff.",
"o.m.a.",
"o.o.",
"o.s.fr.",
"o.tíl.",
"o.ø.",
"okt.",
"omf.",
"pst.",
"ritstj.",
"sbr.",
"sms.",
"smst.",
"smb.",
"sb.",
"sbrt.",
"sp.",
"sept.",
"spf.",
"spsk.",
"t.e.",
"t.s.",
"t.s.s.",
"tlf.",
"tel.",
"tsk.",
"t.o.v.",
"t.d.",
"uml.",
"ums.",
"uppl.",
"upprfr.",
"uppr.",
"útg.",
"útl.",
"útr.",
"vanl.",
"v.",
"v.h.",
"v.ø.o.",
"viðm.",
"viðv.",
"vm.",
"v.m.",
]:
_exc[orth] = [{ORTH: orth}]
capitalized = orth.capitalize()
_exc[capitalized] = [{ORTH: capitalized}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

spacy/lang/nn/__init__.py
@@ -0,0 +1,20 @@
from ...language import BaseDefaults, Language
from ..nb import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class NorwegianNynorskDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
syntax_iterators = SYNTAX_ITERATORS
class NorwegianNynorsk(Language):
lang = "nn"
Defaults = NorwegianNynorskDefaults
__all__ = ["NorwegianNynorsk"]
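Once registered, the new language can be instantiated like any other. A quick usage sketch (assumes a spaCy version that ships the `nn` language, such as the release this commit targets):

import spacy

nlp = spacy.blank("nn")  # tokenizer-only Norwegian Nynorsk pipeline
doc = nlp("Det har lava ned enorme snømengder i store delar av Europa den siste tida.")
print([token.text for token in doc])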

spacy/lang/nn/examples.py
@@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.nn.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
# sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
sentences = [
"Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
"Det er ein meir enn i same periode i fjor.",
"Det har lava ned enorme snømengder i store delar av Europa den siste tida.",
"Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.",
]

@@ -0,0 +1,74 @@
from ..char_classes import (
ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
CURRENCY,
LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
PUNCT,
UNITS,
)
from ..punctuation import TOKENIZER_SUFFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
_list_punct = [x for x in LIST_PUNCT if x != "#"]
_list_icons = [x for x in LIST_ICONS if x != "°"]
_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
_list_quotes = [x for x in LIST_QUOTES if x != "\\'"]
_prefixes = (
["§", "%", "=", "", "", r"\+(?![0-9])"]
+ _list_punct
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_CURRENCY
+ LIST_ICONS
)
_infixes = (
LIST_ELLIPSES
+ _list_icons
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
]
)
_suffixes = (
LIST_PUNCT
+ LIST_ELLIPSES
+ _list_quotes
+ _list_icons
+ ["", ""]
+ [
r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{al}{e}{p}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
]
+ [r"(?<=[^sSxXzZ])'"]
)
_suffixes += [
suffix
for suffix in TOKENIZER_SUFFIXES
if suffix not in ["'s", "'S", "s", "S", r"\'"]
]
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes

@@ -0,0 +1,228 @@
from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {}
for exc_data in [
{ORTH: "jan.", NORM: "januar"},
{ORTH: "feb.", NORM: "februar"},
{ORTH: "mar.", NORM: "mars"},
{ORTH: "apr.", NORM: "april"},
{ORTH: "jun.", NORM: "juni"},
# note: "jul." is in the simple list below without a NORM exception
{ORTH: "aug.", NORM: "august"},
{ORTH: "sep.", NORM: "september"},
{ORTH: "okt.", NORM: "oktober"},
{ORTH: "nov.", NORM: "november"},
{ORTH: "des.", NORM: "desember"},
]:
_exc[exc_data[ORTH]] = [exc_data]
for orth in [
"Ap.",
"Aq.",
"Ca.",
"Chr.",
"Co.",
"Dr.",
"F.eks.",
"Fr.p.",
"Frp.",
"Grl.",
"Kr.",
"Kr.F.",
"Kr.F.s",
"Mr.",
"Mrs.",
"Pb.",
"Pr.",
"Sp.",
"St.",
"a.m.",
"ad.",
"adm.dir.",
"adr.",
"b.c.",
"bl.a.",
"bla.",
"bm.",
"bnr.",
"bto.",
"c.c.",
"ca.",
"cand.mag.",
"co.",
"d.d.",
"d.m.",
"d.y.",
"dept.",
"dr.",
"dr.med.",
"dr.philos.",
"dr.psychol.",
"dss.",
"dvs.",
"e.Kr.",
"e.l.",
"eg.",
"eig.",
"ekskl.",
"el.",
"et.",
"etc.",
"etg.",
"ev.",
"evt.",
"f.",
"f.Kr.",
"f.eks.",
"f.o.m.",
"fhv.",
"fk.",
"foreg.",
"fork.",
"fv.",
"fvt.",
"g.",
"gl.",
"gno.",
"gnr.",
"grl.",
"gt.",
"h.r.adv.",
"hhv.",
"hoh.",
"hr.",
"ifb.",
"ifm.",
"iht.",
"inkl.",
"istf.",
"jf.",
"jr.",
"jul.",
"juris.",
"kfr.",
"kgl.",
"kgl.res.",
"kl.",
"komm.",
"kr.",
"kst.",
"lat.",
"lø.",
"m.a.",
"m.a.o.",
"m.fl.",
"m.m.",
"m.v.",
"ma.",
"mag.art.",
"md.",
"mfl.",
"mht.",
"mill.",
"min.",
"mnd.",
"moh.",
"mrd.",
"muh.",
"mv.",
"mva.",
"n.å.",
"ndf.",
"nr.",
"nto.",
"nyno.",
"o.a.",
"o.l.",
"obl.",
"off.",
"ofl.",
"on.",
"op.",
"org.",
"osv.",
"ovf.",
"p.",
"p.a.",
"p.g.a.",
"p.m.",
"p.t.",
"pga.",
"ph.d.",
"pkt.",
"pr.",
"pst.",
"pt.",
"red.anm.",
"ref.",
"res.",
"res.kap.",
"resp.",
"rv.",
"s.",
"s.d.",
"s.k.",
"s.u.",
"s.å.",
"sen.",
"sep.",
"siviling.",
"sms.",
"snr.",
"spm.",
"sr.",
"sst.",
"st.",
"st.meld.",
"st.prp.",
"stip.",
"stk.",
"stud.",
"sv.",
"såk.",
"sø.",
"t.d.",
"t.h.",
"t.o.m.",
"t.v.",
"temp.",
"ti.",
"tils.",
"tilsv.",
"tl;dr",
"tlf.",
"to.",
"ult.",
"utg.",
"v.",
"vedk.",
"vedr.",
"vg.",
"vgs.",
"vha.",
"vit.ass.",
"vn.",
"vol.",
"vs.",
"vsa.",
"§§",
"©NTB",
"årg.",
"årh.",
]:
_exc[orth] = [{ORTH: orth}]
# Dates
for h in range(1, 31 + 1):
for period in ["."]:
_exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
_exc.update(_custom_base_exc)
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

@@ -1683,6 +1683,12 @@ class Language:
 for proc in procs:
 proc.start()
+# Close writing-end of channels. This is needed to avoid that reading
+# from the channel blocks indefinitely when the worker closes the
+# channel.
+for tx in bytedocs_send_ch:
+tx.close()
 # Cycle channels not to break the order of docs.
 # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
 byte_tuples = chain.from_iterable(
@@ -1705,8 +1711,23 @@ class Language:
 # tell `sender` that one batch was consumed.
 sender.step()
 finally:
+# If we are stopping in an orderly fashion, the workers' queues
+# are empty. Put the sentinel in their queues to signal that work
+# is done, so that they can exit gracefully.
+for q in texts_q:
+q.put(_WORK_DONE_SENTINEL)
+# Otherwise, we are stopping because the error handler raised an
+# exception. The sentinel will be last to go out of the queue.
+# To avoid doing unnecessary work or hanging on platforms that
+# block on sending (Windows), we'll close our end of the channel.
+# This signals to the worker that it can exit the next time it
+# attempts to send data down the channel.
+for r in bytedocs_recv_ch:
+r.close()
 for proc in procs:
-proc.terminate()
+proc.join()
 def _link_components(self) -> None:
 """Register 'listeners' within pipeline components, to allow them to
@@ -2323,6 +2344,11 @@ def _apply_pipes(
 while True:
 try:
 texts_with_ctx = receiver.get()
+# Stop working if we encounter the end-of-work sentinel.
+if isinstance(texts_with_ctx, _WorkDoneSentinel):
+return
 docs = (
 ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
 )
@@ -2331,11 +2357,21 @@ def _apply_pipes(
 # Connection does not accept unpickable objects, so send list.
 byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
 padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs))
-sender.send(byte_docs + padding)  # type: ignore[operator]
+data: Sequence[Tuple[Optional[bytes], Optional[Any], Optional[bytes]]] = (
+byte_docs + padding  # type: ignore[operator]
+)
 except Exception:
 error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
 padding = [(None, None, None)] * (len(texts_with_ctx) - 1)
-sender.send(error_msg + padding)
+data = error_msg + padding
+try:
+sender.send(data)
+except BrokenPipeError:
+# Parent has closed the pipe prematurely. This happens when a
+# worker encounters an error and the error handler is set to
+# stop processing.
+return
 class _Sender:
@@ -2365,3 +2401,10 @@ class _Sender:
 if self.count >= self.chunk_size:
 self.count = 0
 self.send()
+class _WorkDoneSentinel:
+pass
+_WORK_DONE_SENTINEL = _WorkDoneSentinel()
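The sentinel added above is a standard pattern for shutting worker processes down cleanly: rather than calling terminate(), the parent enqueues a marker object after the real work, and each worker exits on its own when it dequeues the marker, allowing join() to succeed. A minimal standalone sketch of the same idea (independent of spaCy's internals):

import multiprocessing as mp

class WorkDone:
    pass

def worker(q):
    while True:
        item = q.get()
        # Compare by type: the sentinel is pickled across the process
        # boundary, so object identity is not preserved.
        if isinstance(item, WorkDone):
            return
        print("processing", item)

if __name__ == "__main__":
    q = mp.Queue()
    p = mp.Process(target=worker, args=(q,))
    p.start()
    for text in ["a", "b", "c"]:
        q.put(text)
    q.put(WorkDone())  # sentinel goes last, after all work items
    p.join()           # join instead of terminate: the worker exits itself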

@@ -3,4 +3,4 @@ from .levenshtein import levenshtein
 from .matcher import Matcher
 from .phrasematcher import PhraseMatcher
-__all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"]
+__all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"]

@@ -1,21 +1,27 @@
 from functools import partial
-from typing import List, Optional, cast
+from typing import List, Optional, Tuple, cast
 from thinc.api import (
 Dropout,
+Gelu,
 LayerNorm,
 Linear,
 Logistic,
 Maxout,
 Model,
 ParametricAttention,
+ParametricAttention_v2,
 Relu,
 Softmax,
 SparseLinear,
+SparseLinear_v2,
 chain,
 clone,
 concatenate,
 list2ragged,
+reduce_first,
+reduce_last,
+reduce_max,
 reduce_mean,
 reduce_sum,
 residual,
@@ -25,9 +31,10 @@ from thinc.api import (
 )
 from thinc.layers.chain import init as init_chain
 from thinc.layers.resizable import resize_linear_weighted, resize_model
-from thinc.types import Floats2d
+from thinc.types import ArrayXd, Floats2d
 from ...attrs import ORTH
+from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
@@ -47,39 +54,15 @@ def build_simple_cnn_text_classifier(
 outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
 is applied instead, so that outputs are in the range [0, 1].
 """
-fill_defaults = {"b": 0, "W": 0}
-with Model.define_operators({">>": chain}):
-cnn = tok2vec >> list2ragged() >> reduce_mean()
-nI = tok2vec.maybe_get_dim("nO")
-if exclusive_classes:
-output_layer = Softmax(nO=nO, nI=nI)
-fill_defaults["b"] = NEG_VALUE
-resizable_layer: Model = resizable(
-output_layer,
-resize_layer=partial(
-resize_linear_weighted, fill_defaults=fill_defaults
-),
-)
-model = cnn >> resizable_layer
-else:
-output_layer = Linear(nO=nO, nI=nI)
-resizable_layer = resizable(
-output_layer,
-resize_layer=partial(
-resize_linear_weighted, fill_defaults=fill_defaults
-),
-)
-model = cnn >> resizable_layer >> Logistic()
-model.set_ref("output_layer", output_layer)
-model.attrs["resize_output"] = partial(
-resize_and_set_ref,
-resizable_layer=resizable_layer,
-)
-model.set_ref("tok2vec", tok2vec)
-if nO is not None:
-model.set_dim("nO", cast(int, nO))
-model.attrs["multi_label"] = not exclusive_classes
-return model
+return build_reduce_text_classifier(
+tok2vec=tok2vec,
+exclusive_classes=exclusive_classes,
+use_reduce_first=False,
+use_reduce_last=False,
+use_reduce_max=False,
+use_reduce_mean=True,
+nO=nO,
+)
 def resize_and_set_ref(model, new_nO, resizable_layer):
@@ -95,10 +78,48 @@ def build_bow_text_classifier(
 ngram_size: int,
 no_output_layer: bool,
 nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+return _build_bow_text_classifier(
+exclusive_classes=exclusive_classes,
+ngram_size=ngram_size,
+no_output_layer=no_output_layer,
+nO=nO,
+sparse_linear=SparseLinear(nO=nO),
+)
+@registry.architectures("spacy.TextCatBOW.v3")
+def build_bow_text_classifier_v3(
+exclusive_classes: bool,
+ngram_size: int,
+no_output_layer: bool,
+length: int = 262144,
+nO: Optional[int] = None,
+) -> Model[List[Doc], Floats2d]:
+if length < 1:
+raise ValueError(Errors.E1056.format(length=length))
+# Find k such that 2**(k-1) < length <= 2**k.
+length = 2 ** (length - 1).bit_length()
+return _build_bow_text_classifier(
+exclusive_classes=exclusive_classes,
+ngram_size=ngram_size,
+no_output_layer=no_output_layer,
+nO=nO,
+sparse_linear=SparseLinear_v2(nO=nO, length=length),
+)
+def _build_bow_text_classifier(
+exclusive_classes: bool,
+ngram_size: int,
+no_output_layer: bool,
+sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd],
+nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
 fill_defaults = {"b": 0, "W": 0}
 with Model.define_operators({">>": chain}):
-sparse_linear = SparseLinear(nO=nO)
 output_layer = None
 if not no_output_layer:
 fill_defaults["b"] = NEG_VALUE
@@ -127,6 +148,9 @@ def build_text_classifier_v2(
 linear_model: Model[List[Doc], Floats2d],
 nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
+# TODO: build the model with _build_parametric_attention_with_residual_nonlinear
+# in spaCy v4. We don't do this in spaCy v3 to preserve model
+# compatibility.
 exclusive_classes = not linear_model.attrs["multi_label"]
 with Model.define_operators({">>": chain, "|": concatenate}):
 width = tok2vec.maybe_get_dim("nO")
@@ -190,3 +214,145 @@ def build_text_classifier_lowdata(
 model = model >> Dropout(dropout)
 model = model >> Logistic()
 return model
@registry.architectures("spacy.TextCatParametricAttention.v1")
def build_textcat_parametric_attention_v1(
tok2vec: Model[List[Doc], List[Floats2d]],
exclusive_classes: bool,
nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
width = tok2vec.maybe_get_dim("nO")
parametric_attention = _build_parametric_attention_with_residual_nonlinear(
tok2vec=tok2vec,
nonlinear_layer=Maxout(nI=width, nO=width),
key_transform=Gelu(nI=width, nO=width),
)
with Model.define_operators({">>": chain}):
if exclusive_classes:
output_layer = Softmax(nO=nO)
else:
output_layer = Linear(nO=nO) >> Logistic()
model = parametric_attention >> output_layer
if model.has_dim("nO") is not False and nO is not None:
model.set_dim("nO", cast(int, nO))
model.set_ref("output_layer", output_layer)
model.attrs["multi_label"] = not exclusive_classes
return model
def _build_parametric_attention_with_residual_nonlinear(
*,
tok2vec: Model[List[Doc], List[Floats2d]],
nonlinear_layer: Model[Floats2d, Floats2d],
key_transform: Optional[Model[Floats2d, Floats2d]] = None,
) -> Model[List[Doc], Floats2d]:
with Model.define_operators({">>": chain, "|": concatenate}):
width = tok2vec.maybe_get_dim("nO")
attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
norm_layer = LayerNorm(nI=width)
parametric_attention = (
tok2vec
>> list2ragged()
>> attention_layer
>> reduce_sum()
>> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
)
parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
parametric_attention.set_ref("tok2vec", tok2vec)
parametric_attention.set_ref("attention_layer", attention_layer)
parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
parametric_attention.set_ref("norm_layer", norm_layer)
return parametric_attention
def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
tok2vec_width = get_tok2vec_width(model)
model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
init_chain(model, X, Y)
return model
@registry.architectures("spacy.TextCatReduce.v1")
def build_reduce_text_classifier(
tok2vec: Model,
exclusive_classes: bool,
use_reduce_first: bool,
use_reduce_last: bool,
use_reduce_max: bool,
use_reduce_mean: bool,
nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
"""Build a model that classifies pooled `Doc` representations.
Pooling is performed using reductions. Reductions are concatenated when
multiple reductions are used.
tok2vec (Model): the tok2vec layer to pool over.
exclusive_classes (bool): Whether or not classes are mutually exclusive.
use_reduce_first (bool): Pool by using the hidden representation of the
first token of a `Doc`.
use_reduce_last (bool): Pool by using the hidden representation of the
last token of a `Doc`.
use_reduce_max (bool): Pool by taking the maximum values of the hidden
representations of a `Doc`.
use_reduce_mean (bool): Pool by taking the mean of all hidden
representations of a `Doc`.
nO (Optional[int]): Number of classes.
"""
fill_defaults = {"b": 0, "W": 0}
reductions = []
if use_reduce_first:
reductions.append(reduce_first())
if use_reduce_last:
reductions.append(reduce_last())
if use_reduce_max:
reductions.append(reduce_max())
if use_reduce_mean:
reductions.append(reduce_mean())
if not len(reductions):
raise ValueError(Errors.E1057)
with Model.define_operators({">>": chain}):
cnn = tok2vec >> list2ragged() >> concatenate(*reductions)
nO_tok2vec = tok2vec.maybe_get_dim("nO")
nI = nO_tok2vec * len(reductions) if nO_tok2vec is not None else None
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nI)
fill_defaults["b"] = NEG_VALUE
resizable_layer: Model = resizable(
output_layer,
resize_layer=partial(
resize_linear_weighted, fill_defaults=fill_defaults
),
)
model = cnn >> resizable_layer
else:
output_layer = Linear(nO=nO, nI=nI)
resizable_layer = resizable(
output_layer,
resize_layer=partial(
resize_linear_weighted, fill_defaults=fill_defaults
),
)
model = cnn >> resizable_layer >> Logistic()
model.set_ref("output_layer", output_layer)
model.attrs["resize_output"] = partial(
resize_and_set_ref,
resizable_layer=resizable_layer,
)
model.set_ref("tok2vec", tok2vec)
if nO is not None:
model.set_dim("nO", cast(int, nO))
model.attrs["multi_label"] = not exclusive_classes
return model
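TextCatReduce's pooling (above) is just a concatenation of simple reduction layers applied to the ragged token-vector array. A small hedged sketch of that building block in thinc, outside any pipeline (shapes and values are illustrative assumptions):

import numpy
from thinc.api import concatenate, reduce_max, reduce_mean
from thinc.types import Ragged

# Two "documents" of 3 and 2 tokens, each token a 4-dim vector.
data = numpy.arange(20, dtype="f").reshape(5, 4)
lengths = numpy.asarray([3, 2], dtype="i")
ragged = Ragged(data, lengths)

# Mean- and max-pool each document, then concatenate the results,
# mirroring `list2ragged() >> concatenate(*reductions)` above.
pooling = concatenate(reduce_mean(), reduce_max())
pooled = pooling.predict(ragged)
print(pooled.shape)  # (2, 8): one 4+4-dim pooled vector per document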

@@ -22,6 +22,7 @@ from .trainable_pipe import TrainablePipe
 __all__ = [
 "AttributeRuler",
 "DependencyParser",
+"EditTreeLemmatizer",
 "EntityLinker",
 "EntityRecognizer",
 "EntityRuler",

@@ -29,7 +29,7 @@ cdef class StateClass:
 return [self.B(i) for i in range(self.c.buffer_length())]
 @property
-def token_vector_lenth(self):
+def token_vector_length(self):
 return self.doc.tensor.shape[1]
 @property
@@ -36,8 +36,9 @@ maxout_pieces = 3
 depth = 2
 [model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false
 """
@@ -45,16 +46,21 @@ DEFAULT_SINGLE_TEXTCAT_MODEL = Config().from_str(single_label_default_config)["m
 single_label_bow_config = """
 [model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = true
+length = 262144
 ngram_size = 1
 no_output_layer = false
 """
 single_label_cnn_config = """
 [model]
-@architectures = "spacy.TextCatCNN.v2"
+@architectures = "spacy.TextCatReduce.v1"
 exclusive_classes = true
+use_reduce_first = false
+use_reduce_last = false
+use_reduce_max = false
+use_reduce_mean = true
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"

@@ -35,8 +35,9 @@ maxout_pieces = 3
 depth = 2
 [model.linear_model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
+length = 262144
 ngram_size = 1
 no_output_layer = false
 """
@@ -44,7 +45,7 @@ DEFAULT_MULTI_TEXTCAT_MODEL = Config().from_str(multi_label_default_config)["mod
 multi_label_bow_config = """
 [model]
-@architectures = "spacy.TextCatBOW.v2"
+@architectures = "spacy.TextCatBOW.v3"
 exclusive_classes = false
 ngram_size = 1
 no_output_layer = false
@@ -52,8 +53,12 @@ no_output_layer = false
 multi_label_cnn_config = """
 [model]
-@architectures = "spacy.TextCatCNN.v2"
+@architectures = "spacy.TextCatReduce.v1"
 exclusive_classes = false
+use_reduce_first = false
+use_reduce_last = false
+use_reduce_max = false
+use_reduce_mean = true
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"

@@ -802,6 +802,140 @@ def get_ner_prf(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
 }
# The following implementation of trapezoid() is adapted from SciPy,
# which is distributed under the New BSD License.
# Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
# See licenses/3rd_party_licenses.txt
def trapezoid(y, x=None, dx=1.0, axis=-1):
r"""
Integrate along the given axis using the composite trapezoidal rule.
If `x` is provided, the integration happens in sequence along its
elements - they are not sorted.
Integrate `y` (`x`) along each 1d slice on the given axis, compute
:math:`\int y(x) dx`.
When `x` is specified, this integrates along the parametric curve,
computing :math:`\int_t y(t) dt =
\int_t y(t) \left.\frac{dx}{dt}\right|_{x=x(t)} dt`.
Parameters
----------
y : array_like
Input array to integrate.
x : array_like, optional
The sample points corresponding to the `y` values. If `x` is None,
the sample points are assumed to be evenly spaced `dx` apart. The
default is None.
dx : scalar, optional
The spacing between sample points when `x` is None. The default is 1.
axis : int, optional
The axis along which to integrate.
Returns
-------
trapezoid : float or ndarray
Definite integral of `y`, approximated along a single axis by the
trapezoidal rule. If `y` is a 1-dimensional array, the result is a
float. If `y` is `n`-dimensional with `n` greater than 1, the result
is an (`n`-1)-dimensional array.
See Also
--------
cumulative_trapezoid, simpson, romb
Notes
-----
Image [2]_ illustrates the trapezoidal rule -- y-axis locations of points
are taken from the `y` array; by default the x-axis distances between
points are 1.0, but they can alternatively be provided via the `x` array
or the `dx` scalar. The return value equals the combined area under
the red lines.
References
----------
.. [1] Wikipedia page: https://en.wikipedia.org/wiki/Trapezoidal_rule
.. [2] Illustration image:
https://en.wikipedia.org/wiki/File:Composite_trapezoidal_rule_illustration.png
Examples
--------
Use the trapezoidal rule on evenly spaced points:
>>> import numpy as np
>>> from scipy import integrate
>>> integrate.trapezoid([1, 2, 3])
4.0
The spacing between sample points can be selected by either the
``x`` or ``dx`` arguments:
>>> integrate.trapezoid([1, 2, 3], x=[4, 6, 8])
8.0
>>> integrate.trapezoid([1, 2, 3], dx=2)
8.0
Using a decreasing ``x`` corresponds to integrating in reverse:
>>> integrate.trapezoid([1, 2, 3], x=[8, 6, 4])
-8.0
More generally ``x`` is used to integrate along a parametric curve. We can
estimate the integral :math:`\int_0^1 x^2 \, dx = 1/3` using:
>>> x = np.linspace(0, 1, num=50)
>>> y = x**2
>>> integrate.trapezoid(y, x)
0.33340274885464394
Or estimate the area of a circle, noting we repeat the sample which closes
the curve:
>>> theta = np.linspace(0, 2 * np.pi, num=1000, endpoint=True)
>>> integrate.trapezoid(np.cos(theta), x=np.sin(theta))
3.141571941375841
``trapezoid`` can be applied along a specified axis to do multiple
computations in one call:
>>> a = np.arange(6).reshape(2, 3)
>>> a
array([[0, 1, 2],
[3, 4, 5]])
>>> integrate.trapezoid(a, axis=0)
array([1.5, 2.5, 3.5])
>>> integrate.trapezoid(a, axis=1)
array([2., 8.])
"""
y = np.asanyarray(y)
if x is None:
d = dx
else:
x = np.asanyarray(x)
if x.ndim == 1:
d = np.diff(x)
# reshape to correct shape
shape = [1] * y.ndim
shape[axis] = d.shape[0]
d = d.reshape(shape)
else:
d = np.diff(x, axis=axis)
nd = y.ndim
slice1 = [slice(None)] * nd
slice2 = [slice(None)] * nd
slice1[axis] = slice(1, None)
slice2[axis] = slice(None, -1)
try:
ret = (d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0).sum(axis)
except ValueError:
# Operations didn't work, cast to ndarray
d = np.asarray(d)
y = np.asarray(y)
ret = np.add.reduce(d * (y[tuple(slice1)] + y[tuple(slice2)]) / 2.0, axis)
return ret
# The following implementation of roc_auc_score() is adapted from # The following implementation of roc_auc_score() is adapted from
# scikit-learn, which is distributed under the New BSD License. # scikit-learn, which is distributed under the New BSD License.
# Copyright (c) 2007–2019 The scikit-learn developers. # Copyright (c) 2007–2019 The scikit-learn developers.
@ -1024,9 +1158,9 @@ def _auc(x, y):
else: else:
raise ValueError(Errors.E164.format(x=x)) raise ValueError(Errors.E164.format(x=x))
area = direction * np.trapz(y, x) area = direction * trapezoid(y, x)
if isinstance(area, np.memmap): if isinstance(area, np.memmap):
# Reductions such as .sum used internally in np.trapz do not return a # Reductions such as .sum used internally in trapezoid do not return a
# scalar by default for numpy.memmap instances contrary to # scalar by default for numpy.memmap instances contrary to
# regular numpy.ndarray instances. # regular numpy.ndarray instances.
area = area.dtype.type(area) area = area.dtype.type(area)

View File

@ -162,6 +162,11 @@ def fi_tokenizer():
return get_lang_class("fi")().tokenizer return get_lang_class("fi")().tokenizer
@pytest.fixture(scope="session")
def fo_tokenizer():
return get_lang_class("fo")().tokenizer
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def fr_tokenizer(): def fr_tokenizer():
return get_lang_class("fr")().tokenizer return get_lang_class("fr")().tokenizer
@ -317,6 +322,11 @@ def nl_tokenizer():
return get_lang_class("nl")().tokenizer return get_lang_class("nl")().tokenizer
@pytest.fixture(scope="session")
def nn_tokenizer():
return get_lang_class("nn")().tokenizer
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def pl_tokenizer(): def pl_tokenizer():
return get_lang_class("pl")().tokenizer return get_lang_class("pl")().tokenizer

View File

@ -731,3 +731,12 @@ def test_for_no_ent_sents():
sents = list(doc.ents[0].sents) sents = list(doc.ents[0].sents)
assert len(sents) == 1 assert len(sents) == 1
assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY" assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
def test_span_api_richcmp_other(en_tokenizer):
doc1 = en_tokenizer("a b")
doc2 = en_tokenizer("b c")
assert not doc1[1:2] == doc1[1]
assert not doc1[1:2] == doc2[0]
assert not doc1[1:2] == doc2[0:1]
assert not doc1[0:1] == doc2

View File

@ -294,3 +294,12 @@ def test_missing_head_dep(en_vocab):
assert aligned_heads[0] == ref_heads[0] assert aligned_heads[0] == ref_heads[0]
assert aligned_deps[5] == ref_deps[5] assert aligned_deps[5] == ref_deps[5]
assert aligned_heads[5] == ref_heads[5] assert aligned_heads[5] == ref_heads[5]
def test_token_api_richcmp_other(en_tokenizer):
doc1 = en_tokenizer("a b")
doc2 = en_tokenizer("b c")
assert not doc1[1] == doc1[0:1]
assert not doc1[1] == doc2[1:2]
assert not doc1[1] == doc2[0]
assert not doc1[0] == doc2

View File

View File

@ -0,0 +1,26 @@
import pytest
# examples taken from Basic Language Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed under CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/)
# fmt: off
FO_TOKEN_EXCEPTION_TESTS = [
(
"Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ",
[
"Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", ".",
],
),
(
"Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.",
[
"Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", ".",
],
),
]
# fmt: on
@pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS)
def test_fo_tokenizer_handles_exception_cases(fo_tokenizer, text, expected_tokens):
tokens = fo_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list

View File

View File

@ -0,0 +1,38 @@
import pytest
# examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
# fmt: off
NN_TOKEN_EXCEPTION_TESTS = [
(
"Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.",
[
"Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "", "raskt", "som", "mogleg", "i", "2014", ".",
],
),
(
"Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.",
[
"Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "", "skulen", "ueinig", "i", ".",
],
),
(
"Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.",
[
"Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", ".",
],
),
(
"Brukssesongen er frå nov. til mai, med ein topp i mars.",
[
"Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", ".",
],
),
]
# fmt: on
@pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS)
def test_nn_tokenizer_handles_exception_cases(nn_tokenizer, text, expected_tokens):
tokens = nn_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list

View File

@ -203,7 +203,7 @@ def test_pipe_class_component_model():
"@architectures": "spacy.TextCatEnsemble.v2", "@architectures": "spacy.TextCatEnsemble.v2",
"tok2vec": DEFAULT_TOK2VEC_MODEL, "tok2vec": DEFAULT_TOK2VEC_MODEL,
"linear_model": { "linear_model": {
"@architectures": "spacy.TextCatBOW.v2", "@architectures": "spacy.TextCatBOW.v3",
"exclusive_classes": False, "exclusive_classes": False,
"ngram_size": 1, "ngram_size": 1,
"no_output_layer": False, "no_output_layer": False,

View File

@ -414,7 +414,7 @@ def test_implicit_label(name, get_examples):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,textcat_config", "name,textcat_config",
[ [
# BOW # BOW V1
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
@ -451,14 +451,14 @@ def test_no_resize(name, textcat_config):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,textcat_config", "name,textcat_config",
[ [
# BOW # BOW V3
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
# CNN # CNN
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
], ],
) )
# fmt: on # fmt: on
@ -480,14 +480,14 @@ def test_resize(name, textcat_config):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,textcat_config", "name,textcat_config",
[ [
# BOW # BOW v3
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": False, "ngram_size": 3}),
("textcat", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}), ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}), ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
# CNN # REDUCE
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
], ],
) )
# fmt: on # fmt: on
@ -693,12 +693,23 @@ def test_overfitting_IO_multi():
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
# BOW V3
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}),
# ENSEMBLE V2 # ENSEMBLE V2
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v2", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
# CNN V2 # CNN V2 (legacy)
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}), ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}), ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
# PARAMETRIC ATTENTION V1
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
# REDUCE V1
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
], ],
) )
# fmt: on # fmt: on

View File

@ -15,7 +15,12 @@ def doc_w_attrs(en_tokenizer):
Token.set_extension("_test_token", default="t0") Token.set_extension("_test_token", default="t0")
doc[1]._._test_token = "t1" doc[1]._._test_token = "t1"
return doc yield doc
Doc.remove_extension("_test_attr")
Doc.remove_extension("_test_prop")
Doc.remove_extension("_test_method")
Token.remove_extension("_test_token")
def test_serialize_ext_attrs_from_bytes(doc_w_attrs): def test_serialize_ext_attrs_from_bytes(doc_w_attrs):

View File

@ -1061,3 +1061,8 @@ def test_debug_data_trainable_lemmatizer_not_annotated():
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
assert data["no_lemma_annotations"] == 2 assert data["no_lemma_annotations"] == 2
def test_project_api_imports():
from spacy.cli import project_run
from spacy.cli.project.run import project_run # noqa: F401, F811

View File

@ -214,9 +214,6 @@ def test_project_clone(options):
assert (out / "README.md").is_file() assert (out / "README.md").is_file()
@pytest.mark.skipif(
sys.version_info >= (3, 12), reason="Python 3.12+ not supported for remotes"
)
def test_project_push_pull(project_dir): def test_project_push_pull(project_dir):
proj = dict(SAMPLE_PROJECT) proj = dict(SAMPLE_PROJECT)
remote = "xyz" remote = "xyz"
@ -241,7 +238,7 @@ def test_project_push_pull(project_dir):
def test_find_function_valid(): def test_find_function_valid():
# example of architecture in main code base # example of architecture in main code base
function = "spacy.TextCatBOW.v2" function = "spacy.TextCatBOW.v3"
result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"]) result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
assert f"Found registered function '{function}'" in result.stdout assert f"Found registered function '{function}'" in result.stdout
assert "textcat.py" in result.stdout assert "textcat.py" in result.stdout
@ -260,7 +257,7 @@ def test_find_function_valid():
def test_find_function_invalid(): def test_find_function_invalid():
# invalid registry # invalid registry
function = "spacy.TextCatBOW.v2" function = "spacy.TextCatBOW.v3"
registry = "foobar" registry = "foobar"
result = CliRunner().invoke( result = CliRunner().invoke(
app, ["find-function", function, "--registry", registry] app, ["find-function", function, "--registry", registry]

View File

@ -2,7 +2,7 @@ import numpy
import pytest import pytest
from spacy import displacy from spacy import displacy
from spacy.displacy.render import DependencyRenderer, EntityRenderer from spacy.displacy.render import DependencyRenderer, EntityRenderer, SpanRenderer
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.fa import Persian from spacy.lang.fa import Persian
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
@ -468,3 +468,23 @@ def test_issue12816(en_vocab) -> None:
# Verify that the HTML tag is still escaped # Verify that the HTML tag is still escaped
html = displacy.render(doc, style="span") html = displacy.render(doc, style="span")
assert "&lt;TEST&gt;" in html assert "&lt;TEST&gt;" in html
@pytest.mark.issue(13056)
def test_displacy_span_stacking():
"""Test whether span stacking works properly for multiple overlapping spans."""
spans = [
{"start_token": 2, "end_token": 5, "label": "SkillNC"},
{"start_token": 0, "end_token": 2, "label": "Skill"},
{"start_token": 1, "end_token": 3, "label": "Skill"},
]
tokens = ["Welcome", "to", "the", "Bank", "of", "China", "."]
per_token_info = SpanRenderer._assemble_per_token_info(spans=spans, tokens=tokens)
assert len(per_token_info) == len(tokens)
assert all([len(per_token_info[i]["entities"]) == 1 for i in (0, 3, 4)])
assert all([len(per_token_info[i]["entities"]) == 2 for i in (1, 2)])
assert per_token_info[1]["entities"][0]["render_slot"] == 1
assert per_token_info[1]["entities"][1]["render_slot"] == 2
assert per_token_info[2]["entities"][0]["render_slot"] == 2
assert per_token_info[2]["entities"][1]["render_slot"] == 3

View File

@ -376,8 +376,9 @@ def test_util_dot_section():
factory = "textcat" factory = "textcat"
[components.textcat.model] [components.textcat.model]
@architectures = "spacy.TextCatBOW.v2" @architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true exclusive_classes = true
length = 262144
ngram_size = 1 ngram_size = 1
no_output_layer = false no_output_layer = false
""" """
@ -485,8 +486,8 @@ def test_to_ternary_int():
def test_find_available_port(): def test_find_available_port():
host = "0.0.0.0" host = "0.0.0.0"
port = 5000 port = 5001
assert find_available_port(port, host) == port, "Port 5000 isn't free" assert find_available_port(port, host) == port, "Port 5001 isn't free"
from wsgiref.simple_server import demo_app, make_server from wsgiref.simple_server import demo_app, make_server

View File

@ -26,6 +26,7 @@ from spacy.ml.models import (
build_Tok2Vec_model, build_Tok2Vec_model,
) )
from spacy.ml.staticvectors import StaticVectors from spacy.ml.staticvectors import StaticVectors
from spacy.util import registry
def get_textcat_bow_kwargs(): def get_textcat_bow_kwargs():
@ -284,3 +285,17 @@ def test_spancat_model_forward_backward(nO=5):
Y, backprop = model((docs, spans), is_train=True) Y, backprop = model((docs, spans), is_train=True)
assert Y.shape == (spans.dataXd.shape[0], nO) assert Y.shape == (spans.dataXd.shape[0], nO)
backprop(Y) backprop(Y)
def test_textcat_reduce_invalid_args():
textcat_reduce = registry.architectures.get("spacy.TextCatReduce.v1")
tok2vec = make_test_tok2vec()
with pytest.raises(ValueError, match=r"must be used with at least one reduction"):
textcat_reduce(
tok2vec=tok2vec,
exclusive_classes=False,
use_reduce_first=False,
use_reduce_last=False,
use_reduce_max=False,
use_reduce_mean=False,
)

View File

@ -85,6 +85,18 @@ def test_tokenizer_explain_special_matcher(en_vocab):
assert tokens == explain_tokens assert tokens == explain_tokens
def test_tokenizer_explain_special_matcher_whitespace(en_vocab):
rules = {":]": [{"ORTH": ":]"}]}
tokenizer = Tokenizer(
en_vocab,
rules=rules,
)
text = ": ]"
tokens = [t.text for t in tokenizer(text)]
explain_tokens = [t[1] for t in tokenizer.explain(text)]
assert tokens == explain_tokens
@hypothesis.strategies.composite @hypothesis.strategies.composite
def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str: def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
""" """
@ -123,6 +135,9 @@ def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
""" """
tokenizer: Tokenizer = spacy.blank(lang).tokenizer tokenizer: Tokenizer = spacy.blank(lang).tokenizer
tokens = [t.text for t in tokenizer(sentence) if not t.is_space] # Tokenizer.explain is not intended to handle whitespace or control
# characters in the same way as Tokenizer
sentence = re.sub(r"\s+", " ", sentence).strip()
tokens = [t.text for t in tokenizer(sentence)]
debug_tokens = [t[1] for t in tokenizer.explain(sentence)] debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}" assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"

View File

@ -730,9 +730,16 @@ cdef class Tokenizer:
if i in spans_by_start: if i in spans_by_start:
span = spans_by_start[i] span = spans_by_start[i]
exc = [d[ORTH] for d in special_cases[span.label_]] exc = [d[ORTH] for d in special_cases[span.label_]]
for j, orth in enumerate(exc): # The phrase matcher can overmatch for tokens separated by
final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth])) # spaces in the text but not in the underlying rule, so skip
i += len(span) # cases where the texts aren't identical
if span.text != "".join([self.vocab.strings[orth] for orth in exc]):
final_tokens.append(tokens[i])
i += 1
else:
for j, orth in enumerate(exc):
final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
i += len(span)
else: else:
final_tokens.append(tokens[i]) final_tokens.append(tokens[i])
i += 1 i += 1

View File

@ -5,4 +5,4 @@ from .span import Span
from .span_group import SpanGroup from .span_group import SpanGroup
from .token import Token from .token import Token
__all__ = ["Doc", "Token", "Span", "SpanGroup", "DocBin", "MorphAnalysis"] __all__ = ["Doc", "DocBin", "MorphAnalysis", "Span", "SpanGroup", "Token"]

View File

@ -42,7 +42,7 @@ class Doc:
user_hooks: Dict[str, Callable[..., Any]] user_hooks: Dict[str, Callable[..., Any]]
user_token_hooks: Dict[str, Callable[..., Any]] user_token_hooks: Dict[str, Callable[..., Any]]
user_span_hooks: Dict[str, Callable[..., Any]] user_span_hooks: Dict[str, Callable[..., Any]]
tensor: np.ndarray[Any, np.dtype[np.float_]] tensor: np.ndarray[Any, np.dtype[np.float64]]
user_data: Dict[str, Any] user_data: Dict[str, Any]
has_unknown_spaces: bool has_unknown_spaces: bool
_context: Any _context: Any
@ -125,7 +125,7 @@ class Doc:
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
alignment_mode: str = ..., alignment_mode: str = ...,
span_id: Union[int, str] = ..., span_id: Union[int, str] = ...,
) -> Span: ... ) -> Optional[Span]: ...
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
@property @property
def has_vector(self) -> bool: ... def has_vector(self) -> bool: ...
@ -166,7 +166,7 @@ class Doc:
) -> Doc: ... ) -> Doc: ...
def to_array( def to_array(
self, py_attr_ids: Union[int, str, List[Union[int, str]]] self, py_attr_ids: Union[int, str, List[Union[int, str]]]
) -> np.ndarray[Any, np.dtype[np.float_]]: ... ) -> np.ndarray[Any, np.dtype[np.float64]]: ...
@staticmethod @staticmethod
def from_docs( def from_docs(
docs: List[Doc], docs: List[Doc],
@ -179,15 +179,13 @@ class Doc:
self, path: Union[str, Path], *, exclude: Iterable[str] = ... self, path: Union[str, Path], *, exclude: Iterable[str] = ...
) -> None: ... ) -> None: ...
def from_disk( def from_disk(
self, path: Union[str, Path], *, exclude: Union[List[str], Tuple[str]] = ... self, path: Union[str, Path], *, exclude: Iterable[str] = ...
) -> Doc: ... ) -> Doc: ...
def to_bytes(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ... def to_bytes(self, *, exclude: Iterable[str] = ...) -> bytes: ...
def from_bytes( def from_bytes(self, bytes_data: bytes, *, exclude: Iterable[str] = ...) -> Doc: ...
self, bytes_data: bytes, *, exclude: Union[List[str], Tuple[str]] = ... def to_dict(self, *, exclude: Iterable[str] = ...) -> Dict[str, Any]: ...
) -> Doc: ...
def to_dict(self, *, exclude: Union[List[str], Tuple[str]] = ...) -> bytes: ...
def from_dict( def from_dict(
self, msg: bytes, *, exclude: Union[List[str], Tuple[str]] = ... self, msg: Dict[str, Any], *, exclude: Iterable[str] = ...
) -> Doc: ... ) -> Doc: ...
def extend_tensor(self, tensor: Floats2d) -> None: ... def extend_tensor(self, tensor: Floats2d) -> None: ...
def retokenize(self) -> Retokenizer: ... def retokenize(self) -> Retokenizer: ...

View File

@ -1326,7 +1326,7 @@ cdef class Doc:
path (str / Path): A path to a directory. Paths may be either path (str / Path): A path to a directory. Paths may be either
strings or `Path`-like objects. strings or `Path`-like objects.
exclude (list): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Doc): The modified `Doc` object. RETURNS (Doc): The modified `Doc` object.
DOCS: https://spacy.io/api/doc#from_disk DOCS: https://spacy.io/api/doc#from_disk
@ -1339,7 +1339,7 @@ cdef class Doc:
def to_bytes(self, *, exclude=tuple()): def to_bytes(self, *, exclude=tuple()):
"""Serialize, i.e. export the document contents to a binary string. """Serialize, i.e. export the document contents to a binary string.
exclude (list): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations. all annotations.
@ -1351,7 +1351,7 @@ cdef class Doc:
"""Deserialize, i.e. import the document contents from a binary string. """Deserialize, i.e. import the document contents from a binary string.
data (bytes): The string to load from. data (bytes): The string to load from.
exclude (list): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Doc): Itself. RETURNS (Doc): Itself.
DOCS: https://spacy.io/api/doc#from_bytes DOCS: https://spacy.io/api/doc#from_bytes
@ -1361,11 +1361,8 @@ cdef class Doc:
def to_dict(self, *, exclude=tuple()): def to_dict(self, *, exclude=tuple()):
"""Export the document contents to a dictionary for serialization. """Export the document contents to a dictionary for serialization.
exclude (list): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including RETURNS (Dict[str, Any]): A dictionary representation of the `Doc`
all annotations.
DOCS: https://spacy.io/api/doc#to_bytes
""" """
array_head = Doc._get_array_attrs() array_head = Doc._get_array_attrs()
strings = set() strings = set()
@ -1411,13 +1408,11 @@ cdef class Doc:
return util.to_dict(serializers, exclude) return util.to_dict(serializers, exclude)
def from_dict(self, msg, *, exclude=tuple()): def from_dict(self, msg, *, exclude=tuple()):
"""Deserialize, i.e. import the document contents from a binary string. """Deserialize the document contents from a dictionary representation.
data (bytes): The string to load from. msg (Dict[str, Any]): The dictionary to load from.
exclude (list): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Doc): Itself. RETURNS (Doc): Itself.
DOCS: https://spacy.io/api/doc#from_dict
""" """
if self.length != 0: if self.length != 0:
raise ValueError(Errors.E033.format(length=self.length)) raise ValueError(Errors.E033.format(length=self.length))

View File

@ -127,14 +127,17 @@ cdef class Span:
self._vector = vector self._vector = vector
self._vector_norm = vector_norm self._vector_norm = vector_norm
def __richcmp__(self, Span other, int op): def __richcmp__(self, object other, int op):
if other is None: if other is None:
if op == 0 or op == 1 or op == 2: if op == 0 or op == 1 or op == 2:
return False return False
else: else:
return True return True
if not isinstance(other, Span):
return False
cdef Span other_span = other
self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc) self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc)
other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.id, other.doc) other_tuple = (other_span.c.start_char, other_span.c.end_char, other_span.c.label, other_span.c.kb_id, other_span.id, other_span.doc)
# < # <
if op == 0: if op == 0:
return self_tuple < other_tuple return self_tuple < other_tuple

View File

@ -53,7 +53,12 @@ class Token:
def __bytes__(self) -> bytes: ... def __bytes__(self) -> bytes: ...
def __str__(self) -> str: ... def __str__(self) -> str: ...
def __repr__(self) -> str: ... def __repr__(self) -> str: ...
def __richcmp__(self, other: Token, op: int) -> bool: ... def __lt__(self, other: Any) -> bool: ...
def __le__(self, other: Any) -> bool: ...
def __eq__(self, other: Any) -> bool: ...
def __ne__(self, other: Any) -> bool: ...
def __gt__(self, other: Any) -> bool: ...
def __ge__(self, other: Any) -> bool: ...
@property @property
def _(self) -> Underscore: ... def _(self) -> Underscore: ...
def nbor(self, i: int = ...) -> Token: ... def nbor(self, i: int = ...) -> Token: ...

View File

@ -139,17 +139,20 @@ cdef class Token:
def __repr__(self): def __repr__(self):
return self.__str__() return self.__str__()
def __richcmp__(self, Token other, int op): def __richcmp__(self, object other, int op):
# http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html
if other is None: if other is None:
if op in (0, 1, 2): if op in (0, 1, 2):
return False return False
else: else:
return True return True
if not isinstance(other, Token):
return False
cdef Token other_token = other
cdef Doc my_doc = self.doc cdef Doc my_doc = self.doc
cdef Doc other_doc = other.doc cdef Doc other_doc = other_token.doc
my = self.idx my = self.idx
their = other.idx their = other_token.idx
if op == 0: if op == 0:
return my < their return my < their
elif op == 2: elif op == 2:

View File

@ -16,3 +16,28 @@ from .iob_utils import ( # noqa: F401
tags_to_entities, tags_to_entities,
) )
from .loggers import console_logger # noqa: F401 from .loggers import console_logger # noqa: F401
__all__ = [
"Alignment",
"Corpus",
"Example",
"JsonlCorpus",
"PlainTextCorpus",
"biluo_tags_to_offsets",
"biluo_tags_to_spans",
"biluo_to_iob",
"create_copy_from_base_model",
"docs_to_json",
"dont_augment",
"iob_to_biluo",
"minibatch_by_padded_size",
"minibatch_by_words",
"offsets_to_biluo_tags",
"orth_variants_augmenter",
"read_json_file",
"remove_bilu_prefix",
"split_bilu_label",
"tags_to_entities",
"validate_get_examples",
"validate_examples",
]

View File

@ -1077,20 +1077,38 @@ def make_tempdir() -> Generator[Path, None, None]:
def is_in_jupyter() -> bool: def is_in_jupyter() -> bool:
"""Check if user is running spaCy from a Jupyter notebook by detecting the """Check if user is running spaCy from a Jupyter or Colab notebook by
IPython kernel. Mainly used for the displaCy visualizer. detecting the IPython kernel. Mainly used for the displaCy visualizer.
RETURNS (bool): True if in Jupyter, False if not. RETURNS (bool): True if in Jupyter/Colab, False if not.
""" """
# https://stackoverflow.com/a/39662359/6400719 # https://stackoverflow.com/a/39662359/6400719
# https://stackoverflow.com/questions/15411967
try: try:
shell = get_ipython().__class__.__name__ # type: ignore[name-defined] if get_ipython().__class__.__name__ == "ZMQInteractiveShell": # type: ignore[name-defined]
if shell == "ZMQInteractiveShell":
return True # Jupyter notebook or qtconsole return True # Jupyter notebook or qtconsole
if get_ipython().__class__.__module__ == "google.colab._shell": # type: ignore[name-defined]
return True # Colab notebook
except NameError: except NameError:
return False # Probably standard Python interpreter pass # Probably standard Python interpreter
# additional check for Colab
try:
import google.colab
return True # Colab notebook
except ImportError:
pass
return False return False
def is_in_interactive() -> bool:
"""Check if user is running spaCy from an interactive Python
shell. Will return True in Jupyter notebooks too.
RETURNS (bool): True if in interactive mode, False if not.
"""
# https://stackoverflow.com/questions/2356399/tell-if-python-is-in-interactive-mode
return hasattr(sys, "ps1") or hasattr(sys, "ps2")
def get_object_name(obj: Any) -> str: def get_object_name(obj: Any) -> str:
"""Get a human-readable name of a Python object, e.g. a pipeline component. """Get a human-readable name of a Python object, e.g. a pipeline component.

View File

@ -78,16 +78,16 @@ subword features, and a
[MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder) encoding layer
consisting of a CNN and a layer-normalized maxout activation function. consisting of a CNN and a layer-normalized maxout activation function.
| Name | Description | | Name | Description |
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ | | `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ | | `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ | | `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ | | `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ | | `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ | | `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ | | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
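For orientation, a minimal sketch of this layer with values in the recommended ranges from the table above (the same values are used by the `TextCatReduce` example further down this page):

```ini
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
```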
### spacy.Tok2VecListener.v1 {id="Tok2VecListener"} ### spacy.Tok2VecListener.v1 {id="Tok2VecListener"}
@ -962,8 +962,9 @@ single-label use-cases where `exclusive_classes = true`, while the
> nO = null > nO = null
> >
> [model.linear_model] > [model.linear_model]
> @architectures = "spacy.TextCatBOW.v2" > @architectures = "spacy.TextCatBOW.v3"
> exclusive_classes = true > exclusive_classes = true
> length = 262144
> ngram_size = 1 > ngram_size = 1
> no_output_layer = false > no_output_layer = false
> >
@ -1017,54 +1018,15 @@ but used an internal `tok2vec` instead of taking it as argument:
</Accordion> </Accordion>
### spacy.TextCatCNN.v2 {id="TextCatCNN"} ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
> #### Example Config > #### Example Config
> >
> ```ini > ```ini
> [model] > [model]
> @architectures = "spacy.TextCatCNN.v2" > @architectures = "spacy.TextCatBOW.v3"
> exclusive_classes = false
> nO = null
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null
> width = 96
> depth = 4
> embed_size = 2000
> window_size = 1
> maxout_pieces = 3
> subword_features = true
> ```
A neural network model where token vectors are calculated using a CNN. The
vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.
| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
not yet resizable. Since v2, new labels can be added to this component, even
after training.
</Accordion>
### spacy.TextCatBOW.v2 {id="TextCatBOW"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatBOW.v2"
> exclusive_classes = false > exclusive_classes = false
> length = 262144
> ngram_size = 1 > ngram_size = 1
> no_output_layer = false > no_output_layer = false
> nO = null > nO = null
@ -1078,17 +1040,108 @@ the others, but may not be as accurate, especially if texts are short.
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ | | `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ | | `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `length` | The size of the weights vector. The length will be rounded up to the next power of two if it is not a power of two. Defaults to `262144`. ~~int~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
<Accordion title="spacy.TextCatBOW.v1 definition" spaced> <Accordion title="Previous versions of spacy.TextCatBOW" spaced>
[TextCatBOW.v1](/api/legacy#TextCatBOW_v1) had the exact same signature, but was - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) was not yet resizable. Since v2,
not yet resizable. Since v2, new labels can be added to this component, even new labels can be added to this component, even after training.
after training. - [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
[TextCatBOW.v2](/api/legacy#TextCatBOW_v2) used an erroneous sparse linear
layer that only used a small number of the allocated parameters.
- [TextCatBOW.v1](/api/legacy#TextCatBOW_v1) and
[TextCatBOW.v2](/api/legacy#TextCatBOW_v2) did not have the `length` argument.
</Accordion> </Accordion>
### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatParametricAttention.v1"
> exclusive_classes = true
> nO = null
>
> [model.tok2vec]
> @architectures = "spacy.Tok2Vec.v2"
>
> [model.tok2vec.embed]
> @architectures = "spacy.MultiHashEmbed.v2"
> width = 64
> rows = [2000, 2000, 1000, 1000, 1000, 1000]
> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
> include_static_vectors = false
>
> [model.tok2vec.encode]
> @architectures = "spacy.MaxoutWindowEncoder.v2"
> width = ${model.tok2vec.embed.width}
> window_size = 1
> maxout_pieces = 3
> depth = 2
> ```
A neural network model that is built upon Tok2Vec and uses parametric attention
to attend to tokens that are relevant to text classification.
| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatReduce.v1 {id="TextCatReduce"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatReduce.v1"
> exclusive_classes = false
> use_reduce_first = false
> use_reduce_last = false
> use_reduce_max = false
> use_reduce_mean = true
> nO = null
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null
> width = 96
> depth = 4
> embed_size = 2000
> window_size = 1
> maxout_pieces = 3
> subword_features = true
> ```
A classifier that pools token hidden representations of each `Doc` using first,
last, max or mean reduction and then applies a classification layer. Reductions
are concatenated when multiple reductions are used, as sketched below.
<Infobox variant="warning" title="Relation to TextCatCNN" id="TextCatCNN">
`TextCatReduce` is a generalization of the older
[`TextCatCNN`](/api/legacy#TextCatCNN_v2) model. `TextCatCNN` always uses a mean
reduction, whereas `TextCatReduce` also supports first, last and max reductions.
</Infobox>
| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `use_reduce_first` | Pool by using the hidden representation of the first token of a `Doc`. ~~bool~~ |
| `use_reduce_last` | Pool by using the hidden representation of the last token of a `Doc`. ~~bool~~ |
| `use_reduce_max` | Pool by taking the maximum values of the hidden representations of a `Doc`. ~~bool~~ |
| `use_reduce_mean` | Pool by taking the mean of all hidden representations of a `Doc`. ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
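Because reductions are concatenated when several of them are enabled, different poolings can be combined. A sketch that pools with both mean and max (all other settings taken from the example config above):

```ini
[model]
@architectures = "spacy.TextCatReduce.v1"
exclusive_classes = false
use_reduce_first = false
use_reduce_last = false
use_reduce_max = true
use_reduce_mean = true
nO = null

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
```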
## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"} ## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"}
### spacy.SpanCategorizer.v1 {id="SpanCategorizer"} ### spacy.SpanCategorizer.v1 {id="SpanCategorizer"}

View File

@ -1268,20 +1268,21 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is
warmed up before any measurements are taken. warmed up before any measurements are taken.
```cli ```cli
$ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup] $ python -m spacy benchmark speed [model] [data_path] [--code] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
``` ```
| Name | Description | | Name | Description |
| -------------------- | -------------------------------------------------------------------------------------------------------- | | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ | | `model` | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | | `data_path` | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~ | | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--no-shuffle` | Do not shuffle documents in the benchmark data. ~~bool (flag)~~ | | `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~ |
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | | `--no-shuffle` | Do not shuffle documents in the benchmark data. ~~bool (flag)~~ |
| `--batches` | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~ | | `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
| `--warmup`, `-w` | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~ | | `--batches` | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | `--warmup`, `-w` | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~ |
| **PRINTS** | Pipeline speed in words per second with a 95% confidence interval. | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **PRINTS** | Pipeline speed in words per second with a 95% confidence interval. |
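For example, a sketch of benchmarking a custom pipeline that registers extra architectures via the new `--code` option (the paths and file names are hypothetical):

```bash
$ python -m spacy benchmark speed ./training/model-best ./corpus/dev.spacy --code functions.py --batches 100
```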
## apply {id="apply", version="3.5", tag="command"} ## apply {id="apply", version="3.5", tag="command"}
@ -1296,6 +1297,9 @@ input formats are:
When a directory is provided it is traversed recursively to collect all files. When a directory is provided it is traversed recursively to collect all files.
When loading a .spacy file, any annotations already stored on the `Doc` that are not overwritten by the pipeline will be preserved.
If you want to evaluate the pipeline on raw text only, make sure that the .spacy file does not contain any annotations.
```bash ```bash
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process] $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
``` ```
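For example, a sketch of running a trained pipeline over a directory of raw `.txt` files (the paths are hypothetical):

```bash
$ python -m spacy apply en_core_web_sm ./raw_texts ./predictions.spacy
```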

View File

@ -400,6 +400,14 @@ identifiers are grouped by token. Instances of this class are typically assigned
to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension
attribute. attribute.
> #### Example
>
> ```python
> # Get the last hidden layer output for "is" (token index 1)
> doc = nlp("This is a text.")
> tensors = doc._.trf_data.last_hidden_layer_state[1]
> ```
| Name | Description | | Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `all_outputs` | List of `Ragged` tensors that correspond to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ | | `all_outputs` | List of `Ragged` tensors that correspond to outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ |

View File

@ -20,10 +20,9 @@ An LLM component is implemented through the `LLMWrapper` class. It is accessible
through a generic `llm` through a generic `llm`
[component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories) [component factory](https://spacy.io/usage/processing-pipelines#custom-components-factories)
as well as through task-specific component factories: `llm_ner`, `llm_spancat`, as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
`llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization` and `llm_rel`, `llm_textcat`, `llm_sentiment`, `llm_summarization`,
`llm_entity_linker`. `llm_entity_linker`, `llm_raw` and `llm_translation`. For these factories, the
GPT-3.5 model from OpenAI is used by default, but this can be customized.
### LLMWrapper.\_\_init\_\_ {id="init",tag="method"}
> #### Example > #### Example
> >
@ -33,13 +32,18 @@ as well as through task-specific component factories: `llm_ner`, `llm_spancat`,
> llm = nlp.add_pipe("llm", config=config) > llm = nlp.add_pipe("llm", config=config)
> >
> # Construction via add_pipe with a task-specific factory and default GPT-3.5 model > # Construction via add_pipe with a task-specific factory and default GPT-3.5 model
> llm = nlp.add_pipe("llm-ner") > llm = nlp.add_pipe("llm_ner")
>
> # Construction via add_pipe with a task-specific factory and custom model
> llm = nlp.add_pipe("llm_ner", config={"model": {"@llm_models": "spacy.Dolly.v1", "name": "dolly-v2-12b"}})
> >
> # Construction from class > # Construction from class
> from spacy_llm.pipeline import LLMWrapper > from spacy_llm.pipeline import LLMWrapper
> llm = LLMWrapper(vocab=nlp.vocab, task=task, model=model, cache=cache, save_io=True) > llm = LLMWrapper(vocab=nlp.vocab, task=task, model=model, cache=cache, save_io=True)
> ``` > ```
### LLMWrapper.\_\_init\_\_ {id="init",tag="method"}
Create a new pipeline instance. In your application, you would normally use a Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
@@ -225,8 +229,8 @@ All tasks are registered in the `llm_tasks` registry.
dataset across multiple storage units for easier processing and lookups. In
`spacy-llm` we use this term (synonymously: "mapping") to describe the splitting
up of prompts if they are too long for a model to handle, and "fusing"
(synonymously: "reducing") to describe how the model responses for several
shards are merged back together into a single document.
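
As an illustration of the terminology (not `spacy-llm`'s actual implementation; the template, shard size and fusing strategy here are invented for the sketch), mapping and reducing could look like this:

```python
# Illustrative shard ("map") and fuse ("reduce") sketch - not spacy-llm's code.
TEMPLATE = "Summarize the following text:\n{text}"

def shard(text: str, max_chars: int) -> list[str]:
    # Split only the document text - the instruction template stays intact
    # in every prompt sent to the model.
    return [text[i : i + max_chars] for i in range(0, len(text), max_chars)]

def fuse(responses: list[str]) -> str:
    # Merge the per-shard responses back into a single result.
    return "\n".join(responses)

long_text = "A very long document. " * 500
prompts = [TEMPLATE.format(text=piece) for piece in shard(long_text, max_chars=2000)]
# responses = [call_llm(prompt) for prompt in prompts]  # call_llm is hypothetical
# merged = fuse(responses)
```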

Prompts are broken up in a manner that _always_ keeps the prompt in the template
intact, meaning that the instructions to the LLM will always stay complete. The
@@ -1133,6 +1137,25 @@ supports `.yml`, `.yaml`, `.json` and `.jsonl`.
path = "textcat_examples.json"
```

If you want to perform few-shot learning with a binary classifier (i.e. a text
either should or should not be assigned to a given class), you can provide
positive and negative examples with answers of "POS" or "NEG". "POS" means that
this example should be assigned the class label defined in the configuration,
"NEG" means it shouldn't. E.g. for spam classification:
```json
[
{
"text": "You won the lottery! Wire a fee of 200$ to be able to withdraw your winnings.",
"answer": "POS"
},
{
"text": "Your order #123456789 has arrived",
"answer": "NEG"
}
]
```
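
If you prefer to generate such an examples file from code, one option is `srsly`, which is installed alongside spaCy; the file name matches the config snippet above:

```python
import srsly

# The same two examples as in the JSON snippet above.
examples = [
    {"text": "You won the lottery! Wire a fee of 200$ to be able to withdraw your winnings.", "answer": "POS"},
    {"text": "Your order #123456789 has arrived", "answer": "NEG"},
]
srsly.write_json("textcat_examples.json", examples)
```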

### REL {id="rel"}

The REL task extracts relations between named entities.

@@ -1484,7 +1507,7 @@ These models all take the same parameters:
> ```ini
> [components.llm.model]
> @llm_models = "spacy.Llama2.v1"
> name = "Llama-2-7b-hf"
> ```

Currently, these models are provided as part of the core library:


@@ -162,7 +162,10 @@ network has an internal CNN Tok2Vec layer and uses attention.
Since `spacy.TextCatCNN.v2`, this architecture has become resizable, which means
that you can add labels to a previously trained textcat. `TextCatCNN` v1 did not
yet support that. `TextCatCNN` has been replaced by the more general
[`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is
identical to `TextCatReduce` with `use_reduce_mean=true`,
`use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`.

> #### Example Config
>
@@ -194,11 +197,58 @@ architecture is usually less accurate than the ensemble, but runs faster.
| `nO`        | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatCNN.v2 {id="TextCatCNN_v2"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatCNN.v2"
> exclusive_classes = false
> nO = null
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null
> width = 96
> depth = 4
> embed_size = 2000
> window_size = 1
> maxout_pieces = 3
> subword_features = true
> ```
A neural network model where token vectors are calculated using a CNN. The
vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.
`TextCatCNN` has been replaced by the more general
[`TextCatReduce`](/api/architectures#TextCatReduce) layer. `TextCatCNN` is
identical to `TextCatReduce` with `use_reduce_mean=true`,
`use_reduce_first=false`, `use_reduce_last=false` and `use_reduce_max=false`.
| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
not yet resizable. Since v2, new labels can be added to this component, even
after training.
</Accordion>
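
To make the stated equivalence concrete, here is a sketch that resolves the `TextCatReduce` counterpart of the `TextCatCNN` example config above. It assumes a spaCy version that ships `spacy.TextCatReduce.v1` (v3.7.2+); the tok2vec settings simply mirror the example config:

```python
from thinc.api import Config
from spacy.util import registry

# TextCatCNN expressed as TextCatReduce (sketch; assumes spaCy >= 3.7.2).
CFG = """
[model]
@architectures = "spacy.TextCatReduce.v1"
exclusive_classes = false
use_reduce_first = false
use_reduce_last = false
use_reduce_mean = true
use_reduce_max = false
nO = null

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""

# Resolves to the same kind of model a TextCatCNN config would have produced.
model = registry.resolve(Config().from_str(CFG))["model"]
```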

### spacy.TextCatBOW.v1 {id="TextCatBOW_v1"}

Since `spacy.TextCatBOW.v2`, this architecture has become resizable, which means
that you can add labels to a previously trained textcat. `TextCatBOW` v1 did not
yet support that. Versions of this model before `spacy.TextCatBOW.v3` used an
erroneous sparse linear layer that only used a small number of the allocated
parameters.

> #### Example Config
>
@@ -222,6 +272,33 @@ the others, but may not be as accurate, especially if texts are short.
| `nO`        | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatBOW.v2 {id="TextCatBOW"}
Versions of this model before `spacy.TextCatBOW.v3` used an erroneous sparse
linear layer that only used a small number of the allocated parameters.
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatBOW.v2"
> exclusive_classes = false
> ngram_size = 1
> no_output_layer = false
> nO = null
> ```
An n-gram "bag-of-words" model. This architecture should run much faster than
the others, but may not be as accurate, especially if texts are short.
| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"}

Identical to

@@ -89,6 +89,21 @@ architectures and their arguments and hyperparameters.
| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
| `allow_overlap` <Tag variant="new">3.5.1</Tag>   | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
<Infobox variant="warning">
If you set a non-default value for `spans_key`, you'll have to update
`[training.score_weights]` as well so that weights are computed properly. E.g.
for `spans_key == "myspankey"`, include this in your config:
```ini
[training.score_weights]
spans_myspankey_f = 1.0
spans_myspankey_p = 0.0
spans_myspankey_r = 0.0
```
</Infobox>
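
For completeness, the custom key itself is set in the component config; a minimal sketch with the same placeholder key as above:

```python
import spacy

nlp = spacy.blank("en")
# Predictions will then be stored in doc.spans["myspankey"].
spancat = nlp.add_pipe("spancat", config={"spans_key": "myspankey"})
```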

```python
%%GITHUB_SPACY/spacy/pipeline/spancat.py
```


@@ -397,6 +397,17 @@ are wrapped into the
by this class. Instances of this class are typically assigned to the
[`Doc._.trf_data`](/api/transformer#assigned-attributes) extension attribute.
> #### Example
>
> ```python
> # Get the last hidden layer output for "is" (token index 1)
> doc = nlp("This is a text.")
> indices = doc._.trf_data.align[1].data.flatten()
> last_hidden_state = doc._.trf_data.model_output.last_hidden_state
> dim = last_hidden_state.shape[-1]
> tensors = last_hidden_state.reshape(-1, dim)[indices]
> ```

| Name     | Description |
| -------- | ----------- |
| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |


@@ -13,7 +13,7 @@ between `Doc` objects.
<Infobox variant="warning">

Note that a `Vocab` instance is not static. It increases in size as texts with
new tokens are processed. Some models may have an empty vocab at initialization.

</Infobox>
@@ -93,6 +93,7 @@ given string, you need to look it up in

> #### Example
>
> ```python
> nlp("I'm eating an apple")
> apple = nlp.vocab.strings["apple"]
> oov = nlp.vocab.strings["dskfodkfos"]
> assert apple in nlp.vocab


@@ -108,12 +108,12 @@ In the `sm`/`md`/`lg` models:

#### CNN/CPU pipelines with floret vectors

The Croatian, Finnish, Korean, Slovenian, Swedish and Ukrainian `md` and `lg`
pipelines use [floret vectors](/usage/v3-2#vectors) instead of default vectors.
If you're running a trained pipeline on texts and working with [`Doc`](/api/doc)
objects, you shouldn't notice any difference with floret vectors. With floret
vectors no tokens are out-of-vocabulary, so
[`Token.is_oov`](/api/token#attributes) will return `False` for all tokens.
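
For example, with one of these floret pipelines installed (here the Finnish `md` pipeline, which has to be downloaded first), the claim can be checked directly:

```python
import spacy

# Assumes: python -m spacy download fi_core_news_md
nlp = spacy.load("fi_core_news_md")
doc = nlp("Tämä on esimerkkilause.")

# With floret vectors, no token is out-of-vocabulary.
assert not any(token.is_oov for token in doc)
```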

If you access vectors directly for similarity comparisons, there are a few
differences because floret vectors don't include a fixed word list like the
@@ -132,10 +132,20 @@ vector keys for default vectors.

### Transformer pipeline design {id="design-trf"}

In the transformer (`trf`) pipelines, the `tagger`, `parser` and `ner` (if
present) all listen to the `transformer` component. The `attribute_ruler` and
`lemmatizer` have the same configuration as in the CNN models.
For spaCy v3.0-v3.6, `trf` pipelines use
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) and the
transformer output in `doc._.trf_data` is a
[`TransformerData`](/api/transformer#transformerdata) object.
For spaCy v3.7+, `trf` pipelines use
[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers)
and `doc._.trf_data` is a
[`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object.
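
A quick way to check which object your environment produces is to inspect the type at runtime; this sketch assumes the `en_core_web_trf` pipeline is installed:

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("This is a text.")

# spaCy v3.0-v3.6: TransformerData; v3.7+: DocTransformerOutput
print(type(doc._.trf_data).__name__)
```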

### Modifying the default pipeline {id="design-modify"}

For faster processing, you may only want to run a subset of the components in a


@@ -31,8 +31,6 @@ for ent in doc.ents:
Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what
our example sentence and its named entities look like:

<Standalone height={120}>
<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}><mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Apple <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> is looking at buying <mark style={{ background: '#feca74', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>U.K. <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>GPE</span></mark> startup for <mark style={{ background: '#e4e7d2', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>$1 billion <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>MONEY</span></mark></div>
</Standalone>


@@ -56,8 +56,7 @@ for token in doc:
Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what
our example sentence and its dependencies look like:

<ImageScrollable
  src="/images/displacy-long.svg"
  width={1975}
/>


@@ -153,8 +153,9 @@ maxout_pieces = 3
depth = 2

[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true
length = 262144
ngram_size = 1
no_output_layer = false
```

@@ -170,8 +171,9 @@ factory = "textcat"
labels = []

[components.textcat.model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true
length = 262144
ngram_size = 1
no_output_layer = false
nO = null


@@ -290,11 +290,7 @@ for token in doc:
| toward        | `prep` | shift  | `NOUN` | manufacturers |
| manufacturers | `pobj` | toward | `ADP`  |               |

<ImageScrollable src="/images/displacy-long2.svg" width={1275} />

Because the syntactic relations form a tree, every word has **exactly one
head**. You can therefore iterate over the arcs in the tree by iterating over
@@ -709,11 +705,9 @@ doc = nlp(text)
displacy.serve(doc, style="ent")
```

<Standalone height={180}>
<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>When <mark style={{ background: '#aa9cfc', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Sebastian Thrun <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>PERSON</span></mark> started working on self-driving cars at <mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> in <mark style={{ background: '#bfe1d9', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>2007 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>DATE</span></mark>, few people outside of the company took him seriously.</div>
</Standalone>

## Entity Linking {id="entity-linking"}

@@ -723,6 +717,10 @@ identifier from a knowledge base (KB). You can create your own
[`KnowledgeBase`](/api/kb) and [train](/usage/training) a new
[`EntityLinker`](/api/entitylinker) using that custom knowledge base.

For an example of how to define a `KnowledgeBase` and train an entity linker
model, see [this tutorial](https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson)
using [spaCy projects](/usage/projects).

### Accessing entity identifiers {id="entity-linking-accessing",model="entity linking"}

The annotated KB identifier is accessible as either a hash value or as a string,
@@ -733,6 +731,7 @@ object, or the `ent_kb_id` and `ent_kb_id_` attributes of a

```python
import spacy
# "my_custom_el_pipeline" is assumed to be a custom NLP pipeline that was trained and serialized to disk
nlp = spacy.load("my_custom_el_pipeline")
doc = nlp("Ada Lovelace was born in London")


@@ -1328,8 +1328,9 @@ labels = []
# This function is created and then passed to the "textcat" component as
# the argument "model"
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true
length = 262144
ngram_size = 1
no_output_layer = false


@@ -1144,10 +1144,9 @@ relations and tokens we want to match:
> displacy.serve(doc)
> ```

<ImageScrollable
  src="/images/displacy-dep-founded.svg"
  width={925}
/>

The relations we're interested in are:


@@ -405,7 +405,7 @@ available to spaCy, all you need to do is install the package in your
environment:

```bash
$ python -m pip install .
```

spaCy is now able to create the pipeline component `"snek"` even though you
@@ -586,11 +586,9 @@ After installing the package, the custom colors will be used when visualizing
text with `displacy`. Whenever the label `SNEK` is assigned, it will be
displayed in `#3dff74`.

<Standalone height={100}>
<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>🌱🌿 <mark style={{ background: '#3dff74', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>🐍 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>SNEK</span></mark> ____ 🌳🌲 ____ <mark style={{ background: '#cfc5ff', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>👨‍🌾 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>HUMAN</span></mark> 🏘️</div>
</Standalone>

## Saving, loading and distributing trained pipelines {id="models"}

@@ -675,7 +673,7 @@ $ python -m spacy package ./en_example_pipeline ./packages
```

This command will create a pipeline package directory and will run
`python -m build` in that directory to create a binary `.whl` file or
`.tar.gz` archive of your package that can be installed using `pip install`.
Installing the binary wheel is usually more efficient.


@@ -77,11 +77,9 @@ doc.spans["custom"] = [Span(doc, 3, 6, "ORG"), Span(doc, 5, 6, "GPE")]
displacy.serve(doc, style="span", options={"spans_key": "custom"})
```

<Standalone height={100}>
<div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#7aecec', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#7aecec', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>ORG</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#feca74', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>GPE</span></span></span>.</div>
</Standalone>

## Additional features and improvements


@@ -119,11 +119,9 @@ doc = nlp(text)
displacy.serve(doc, style="ent")
```

<Standalone height={180}>
<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>When <mark style={{ background: '#aa9cfc', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Sebastian Thrun <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>PERSON</span></mark> started working on self-driving cars at <mark style={{ background: '#7aecec', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> in <mark style={{ background: '#bfe1d9', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>2007 <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>DATE</span></mark>, few people outside of the company took him seriously.</div>
</Standalone>

The entity visualizer lets you customize the following `options`:
@@ -148,11 +146,9 @@ use the `colors` setting to add your own colors for them.
> displacy.serve(doc, style="ent", options=options)
> ```

<Standalone height={225}>
<div style={{lineHeight: 2.5, fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18}}>But <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Google <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark> is starting from behind. The company made a late push into hardware, and <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Apple <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark>’s Siri, available on iPhones, and <mark style={{ background: 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', padding: '0.45em 0.6em', margin: '0 0.25em', lineHeight: 1, borderRadius: '0.35em'}}>Amazon <span style={{ fontSize: '0.8em', fontWeight: 'bold', lineHeight: 1, borderRadius: '0.35em', marginLeft: '0.5rem'}}>ORG</span></mark>’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer adoption.</div>
</Standalone>

The above example uses a little trick: Since the background color values are
added as the `background` style attribute, you can use any
@@ -197,11 +193,9 @@ doc.spans["sc"] = [
displacy.serve(doc, style="span")
```

<Standalone height={100}>
<div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#7aecec', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#7aecec', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>ORG</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#7aecec', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#feca74', top: 57, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#feca74', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>GPE</span></span></span>.</div>
</Standalone>

The span visualizer lets you customize the following `options`:
@@ -223,11 +217,9 @@ specify which one displaCy should use with `spans_key` (`sc` is the default).
> displacy.serve(doc, style="span", options=options)
> ```

<Standalone height={100}>
<div style={{ lineHeight: 2.5, direction: 'ltr', fontFamily: "-apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'", fontSize: 18 }}>Welcome to the <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>Bank<span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span><span style={{ background: '#ddd', top: 40, height: 4, borderTopLeftRadius: 3, borderBottomLeftRadius: 3, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}><span style={{ background: '#ddd', color: '#000', top: '-0.5em', padding: '2px 3px', position: 'absolute', fontSize: '0.6em', fontWeight: 'bold', lineHeight: 1, borderRadius: 3 }}>BANK</span></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>of <span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span> <span style={{ fontWeight: 'bold', display: 'inline-block', position: 'relative'}}>China<span style={{ background: '#ddd', top: 40, height: 4, left: -1, width: 'calc(100% + 2px)', position: 'absolute' }}></span></span>.</div>
</Standalone>

## Using displaCy in Jupyter notebooks {id="jupyter"}


@@ -103,6 +103,10 @@
"has_examples": true,
"models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"]
},
{
"code": "fo",
"name": "Faroese"
},
{
"code": "fr",
"name": "French",
@@ -290,6 +294,12 @@
"example": "Dit is een zin.",
"has_examples": true
},
{
"code": "nn",
"name": "Norwegian Nynorsk",
"example": "Det er ein meir enn i same periode i fjor.",
"has_examples": true
},
{
"code": "pl",
"name": "Polish",


@@ -9,14 +9,9 @@
{ "text": "Models & Languages", "url": "/usage/models" },
{ "text": "Facts & Figures", "url": "/usage/facts-figures" },
{ "text": "spaCy 101", "url": "/usage/spacy-101" },
{ "text": "New in v3.7", "url": "/usage/v3-7" },
{ "text": "New in v3.6", "url": "/usage/v3-6" },
{ "text": "New in v3.5", "url": "/usage/v3-5" }
]
},
{


@@ -66,6 +66,10 @@
{
"text": "Stack Overflow",
"url": "http://stackoverflow.com/questions/tagged/spacy"
},
{
"text": "Merchandise",
"url": "https://explosion.ai/merch"
} }
]
},


@@ -4500,6 +4500,23 @@
"website": "https://nlp.unibuc.ro/people/snisioi.html"
},
"category": ["pipeline", "training", "models"]
},
{
"id": "redfield-spacy-nodes",
"title": "Redfield NLP Nodes for KNIME",
"slogan": "Makes the functionality of the spaCy library available in KNIME Analytics Platform.",
"description": "This extension provides nodes that make the functionality of the spaCy library available in the [KNIME Analytics Platform](https://www.knime.com/).",
"github": "Redfield-AB/Spacy-Nodes",
"url": "https://redfield.ai/spacy-redfield/",
"thumb": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/redfield_logo_100x100.png",
"image": "https://raw.githubusercontent.com/Redfield-AB/Spacy-Nodes/master/resource/screen1.png",
"author": "Redfield AB",
"author_links": {
"twitter": "Redfield_AB",
"github": "Redfield-AB",
"website": "https://redfield.ai"
},
"category": ["standalone"]
} }
],


@@ -1,80 +0,0 @@
<div
class="entities"
style="
line-height: 2.5;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
font-size: 18px;
"
>But
<mark
class="entity"
style="
background: linear-gradient(90deg, #aa9cfc, #fc9ce7);
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>Google
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>ORG</span
></mark
>is starting from behind. The company made a late push into hardware, and
<mark
class="entity"
style="
background: linear-gradient(90deg, #aa9cfc, #fc9ce7);
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>Apple
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>ORG</span
></mark
>’s Siri, available on iPhones, and
<mark
class="entity"
style="
background: linear-gradient(90deg, #aa9cfc, #fc9ce7);
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>Amazon
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>ORG</span
></mark
>’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer
adoption.</div
>


@@ -1,59 +0,0 @@
<div
class="entities"
style="
line-height: 2.5;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
font-size: 16px;
"
>
🌱🌿
<mark
class="entity"
style="
background: #3dff74;
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>🐍
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>SNEK</span
></mark
>
____ 🌳🌲 ____
<mark
class="entity"
style="
background: #cfc5ff;
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>👨‍🌾
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>HUMAN</span
></mark
>
🏘️
</div>


@@ -1,84 +0,0 @@
<div
class="entities"
style="
line-height: 2.5;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
font-size: 16px;
"
>
<mark
class="entity"
style="
background: #7aecec;
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>
Apple
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>ORG</span
>
</mark>
is looking at buying
<mark
class="entity"
style="
background: #feca74;
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>
U.K.
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>GPE</span
>
</mark>
startup for
<mark
class="entity"
style="
background: #e4e7d2;
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>
$1 billion
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>MONEY</span
>
</mark>
</div>


@@ -1,86 +0,0 @@
<div
class="entities"
style="
line-height: 2.5;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
font-size: 18px;
"
>
When
<mark
class="entity"
style="
background: #aa9cfc;
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>
Sebastian Thrun
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>PERSON</span
>
</mark>
started working on self-driving cars at
<mark
class="entity"
style="
background: #7aecec;
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>
Google
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>ORG</span
>
</mark>
in
<mark
class="entity"
style="
background: #bfe1d9;
padding: 0.45em 0.6em;
margin: 0 0.25em;
line-height: 1;
border-radius: 0.35em;
"
>
2007
<span
style="
font-size: 0.8em;
font-weight: bold;
line-height: 1;
border-radius: 0.35em;
text-transform: uppercase;
vertical-align: middle;
margin-left: 0.5rem;
"
>DATE</span
>
</mark>
, few people outside of the company took him seriously.
</div>


@@ -0,0 +1,212 @@
<svg
xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink"
id="0"
class="displacy"
width="1275"
height="399.5"
style="
max-width: none;
height: 399.5px;
color: #000000;
background: #ffffff;
font-family: Arial;
"
>
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
<tspan class="displacy-word" fill="currentColor" x="50">Autonomous</tspan>
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">ADJ</tspan>
</text>
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
<tspan class="displacy-word" fill="currentColor" x="225">cars</tspan>
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">NOUN</tspan>
</text>
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
<tspan class="displacy-word" fill="currentColor" x="400">shift</tspan>
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="400">VERB</tspan>
</text>
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
<tspan class="displacy-word" fill="currentColor" x="575">insurance</tspan>
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="575">NOUN</tspan>
</text>
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
<tspan class="displacy-word" fill="currentColor" x="750">liability</tspan>
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="750">NOUN</tspan>
</text>
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
<tspan class="displacy-word" fill="currentColor" x="925">toward</tspan>
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="925">ADP</tspan>
</text>
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
<tspan class="displacy-word" fill="currentColor" x="1100">manufacturers</tspan>
<tspan class="displacy-tag" dy="2em" fill="currentColor" x="1100">NOUN</tspan>
</text>
<g class="displacy-arrow">
<path
class="displacy-arc"
id="arrow-0-0"
stroke-width="2px"
d="M70,264.5 C70,177.0 215.0,177.0 215.0,264.5"
fill="none"
stroke="currentColor"
></path>
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textpath
xlink:href="#arrow-0-0"
class="displacy-label"
startOffset="50%"
fill="currentColor"
text-anchor="middle"
>
amod
</textpath>
</text>
<path
class="displacy-arrowhead"
d="M70,266.5 L62,254.5 78,254.5"
fill="currentColor"
></path>
</g>
<g class="displacy-arrow">
<path
class="displacy-arc"
id="arrow-0-1"
stroke-width="2px"
d="M245,264.5 C245,177.0 390.0,177.0 390.0,264.5"
fill="none"
stroke="currentColor"
></path>
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textpath
xlink:href="#arrow-0-1"
class="displacy-label"
startOffset="50%"
fill="currentColor"
text-anchor="middle"
>
nsubj
</textpath>
</text>
<path
class="displacy-arrowhead"
d="M245,266.5 L237,254.5 253,254.5"
fill="currentColor"
></path>
</g>
<g class="displacy-arrow">
<path
class="displacy-arc"
id="arrow-0-2"
stroke-width="2px"
d="M595,264.5 C595,177.0 740.0,177.0 740.0,264.5"
fill="none"
stroke="currentColor"
></path>
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textpath
xlink:href="#arrow-0-2"
class="displacy-label"
startOffset="50%"
fill="currentColor"
text-anchor="middle"
>
compound
</textpath>
</text>
<path
class="displacy-arrowhead"
d="M595,266.5 L587,254.5 603,254.5"
fill="currentColor"
></path>
</g>
<g class="displacy-arrow">
<path
class="displacy-arc"
id="arrow-0-3"
stroke-width="2px"
d="M420,264.5 C420,89.5 745.0,89.5 745.0,264.5"
fill="none"
stroke="currentColor"
></path>
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textpath
xlink:href="#arrow-0-3"
class="displacy-label"
startOffset="50%"
fill="currentColor"
text-anchor="middle"
>
dobj
</textpath>
</text>
<path
class="displacy-arrowhead"
d="M745.0,266.5 L753.0,254.5 737.0,254.5"
fill="currentColor"
></path>
</g>
<g class="displacy-arrow">
<path
class="displacy-arc"
id="arrow-0-4"
stroke-width="2px"
d="M420,264.5 C420,2.0 925.0,2.0 925.0,264.5"
fill="none"
stroke="currentColor"
></path>
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textpath
xlink:href="#arrow-0-4"
class="displacy-label"
startOffset="50%"
fill="currentColor"
text-anchor="middle"
>
prep
</textpath>
</text>
<path
class="displacy-arrowhead"
d="M925.0,266.5 L933.0,254.5 917.0,254.5"
fill="currentColor"
></path>
</g>
<g class="displacy-arrow">
<path
class="displacy-arc"
id="arrow-0-5"
stroke-width="2px"
d="M945,264.5 C945,177.0 1090.0,177.0 1090.0,264.5"
fill="none"
stroke="currentColor"
></path>
<text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
<textpath
xlink:href="#arrow-0-5"
class="displacy-label"
startOffset="50%"
fill="currentColor"
text-anchor="middle"
>
pobj
</textpath>
</text>
<path
class="displacy-arrowhead"
d="M1090.0,266.5 L1098.0,254.5 1082.0,254.5"
fill="currentColor"
></path>
</g>
</svg>


@@ -1,84 +0,0 @@
<div
class="spans"
style="
line-height: 2.5;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
font-size: 18px;
direction: ltr;
"
>
Welcome to the
<span style="font-weight: bold; display: inline-block; position: relative">
Bank
<span
style="
background: #ddd;
top: 40px;
height: 4px;
left: -1px;
width: calc(100% + 2px);
position: absolute;
"
>
</span>
<span
style="
background: #ddd;
top: 40px;
height: 4px;
border-top-left-radius: 3px;
border-bottom-left-radius: 3px;
left: -1px;
width: calc(100% + 2px);
position: absolute;
"
>
<span
style="
background: #ddd;
color: #000;
top: -0.5em;
padding: 2px 3px;
position: absolute;
font-size: 0.6em;
font-weight: bold;
line-height: 1;
border-radius: 3px;
"
>
BANK
</span>
</span>
</span>
<span style="font-weight: bold; display: inline-block; position: relative">
of
<span
style="
background: #ddd;
top: 40px;
height: 4px;
left: -1px;
width: calc(100% + 2px);
position: absolute;
"
>
</span>
</span>
<span style="font-weight: bold; display: inline-block; position: relative">
China
<span
style="
background: #ddd;
top: 40px;
height: 4px;
left: -1px;
width: calc(100% + 2px);
position: absolute;
"
>
</span>
</span>
.
</div>


@@ -1,123 +0,0 @@
<div
class="spans"
style="
line-height: 2.5;
direction: ltr;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif,
'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol';
font-size: 18px;
"
>
Welcome to the
<span style="font-weight: bold; display: inline-block; position: relative">
Bank
<span
style="
background: #7aecec;
top: 40px;
height: 4px;
left: -1px;
width: calc(100% + 2px);
position: absolute;
"
>
</span>
<span
style="
background: #7aecec;
top: 40px;
height: 4px;
border-top-left-radius: 3px;
border-bottom-left-radius: 3px;
left: -1px;
width: calc(100% + 2px);
position: absolute;
"
>
<span
style="
background: #7aecec;
color: #000;
top: -0.5em;
padding: 2px 3px;
position: absolute;
font-size: 0.6em;
font-weight: bold;
line-height: 1;
border-radius: 3px;
"
>
ORG
</span>
</span>
</span>
<span style="font-weight: bold; display: inline-block; position: relative">
of
<span
style="
background: #7aecec;
top: 40px;
height: 4px;
left: -1px;
width: calc(100% + 2px);
position: absolute;
"
>
</span>
</span>
<span style="font-weight: bold; display: inline-block; position: relative">
China
<span
style="
background: #7aecec;
top: 40px;
height: 4px;
left: -1px;
width: calc(100% + 2px);
position: absolute;
"
>
</span>
<span
style="
background: #feca74;
top: 57px;
height: 4px;
left: -1px;
width: calc(100% + 2px);
position: absolute;
"
>
</span>
<span
style="
background: #feca74;
top: 57px;
height: 4px;
border-top-left-radius: 3px;
border-bottom-left-radius: 3px;
left: -1px;
width: calc(100% + 2px);
position: absolute;
"
>
<span
style="
background: #feca74;
color: #000;
top: -0.5em;
padding: 2px 3px;
position: absolute;
font-size: 0.6em;
font-weight: bold;
line-height: 1;
border-radius: 3px;
"
>
GPE
</span>
</span>
</span>
.
</div>


@@ -107,6 +107,22 @@ const Image = ({ src, alt, title, href, ...props }) => {
)
}
const ImageScrollable = ({ src, alt, width, ...props }) => {
return (
<figure className={classNames(classes.standalone, classes.scrollable)}>
<img className={classes['image-scrollable']} src={src} alt={alt} width={width} height="auto" />
</figure>
)
}
const Standalone = ({ height, children, ...props }) => {
return (
<figure className={classes.standalone} style={{ height }}>
{children}
</figure>
)
}
const ImageFill = ({ image, ...props }) => {
return (
<span
@@ -137,4 +153,4 @@ const GoogleSheet = ({ id, link, height, button = 'View full table' }) => {
)
}

export { YouTube, SoundCloud, Iframe, Image, ImageFill, ImageScrollable, GoogleSheet, Standalone }


@@ -13,7 +13,7 @@ import Aside from './components/aside'
import Button from './components/button'
import Tag from './components/tag'
import Grid from './components/grid'
import { YouTube, SoundCloud, Iframe, Image, ImageScrollable, GoogleSheet, Standalone } from './components/embed'
import Project from './widgets/project'
import { Integration, IntegrationLogo } from './widgets/integration.js'
import { Logos, Colors, Patterns } from './widgets/styleguide'
@@ -90,6 +90,8 @@ export const remarkComponents = {
 * For regular img elements it is not possible to pass properties
 */
Image,
ImageScrollable,
Standalone,
Label,
Logos,

Some files were not shown because too many files have changed in this diff.