diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index d60c90c1c..976b1f4f2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -45,6 +45,12 @@ jobs:
run: |
python -m pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+ - name: cython-lint
+ run: |
+ python -m pip install cython-lint -c requirements.txt
+ # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
+ cython-lint spacy --ignore E501,W291,E266
+
tests:
name: Test
needs: Validate
@@ -52,10 +58,8 @@ jobs:
fail-fast: true
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
- python_version: ["3.11"]
+ python_version: ["3.11", "3.12.0-rc.2"]
include:
- - os: ubuntu-20.04
- python_version: "3.6"
- os: windows-latest
python_version: "3.7"
- os: macos-latest
@@ -89,7 +93,7 @@ jobs:
- name: Run mypy
run: |
python -m mypy spacy
- if: matrix.python_version != '3.6'
+ if: matrix.python_version != '3.7'
- name: Delete source directory and .egg-info
run: |
diff --git a/README.md b/README.md
index 59d3ee9ee..b2ffa4639 100644
--- a/README.md
+++ b/README.md
@@ -6,23 +6,20 @@ spaCy is a library for **advanced Natural Language Processing** in Python and
Cython. It's built on the very latest research, and was designed from day one to
be used in real products.
-spaCy comes with
-[pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **70+ languages**. It features
-state-of-the-art speed and **neural network models** for tagging,
-parsing, **named entity recognition**, **text classification** and more,
-multi-task learning with pretrained **transformers** like BERT, as well as a
+spaCy comes with [pretrained pipelines](https://spacy.io/models) and currently
+supports tokenization and training for **70+ languages**. It features
+state-of-the-art speed and **neural network models** for tagging, parsing,
+**named entity recognition**, **text classification** and more, multi-task
+learning with pretrained **transformers** like BERT, as well as a
production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
+open-source software, released under the
+[MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
-💥 **We'd love to hear more about your experience with spaCy!**
-[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
-
-💫 **Version 3.5 out now!**
+💫 **Version 3.7 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
-[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
+[](https://github.com/explosion/spaCy/actions/workflows/tests.yml)
[](https://github.com/explosion/spaCy/releases)
[](https://pypi.org/project/spacy/)
[](https://anaconda.org/conda-forge/spacy)
@@ -35,22 +32,22 @@ open-source software, released under the [MIT license](https://github.com/explos
## 📖 Documentation
-| Documentation | |
-| ----------------------------- | ---------------------------------------------------------------------- |
-| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
-| 📚 **[Usage Guides]** | How to use spaCy and its features. |
-| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
-| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
-| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
-| 📦 **[Models]** | Download trained pipelines for spaCy. |
-| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
-| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
-| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
-| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
-| 🛠 **[Changelog]** | Changes and version history. |
-| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
-| | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
-| | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
+| Documentation | |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
+| 📚 **[Usage Guides]** | How to use spaCy and its features. |
+| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
+| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
+| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
+| 📦 **[Models]** | Download trained pipelines for spaCy. |
+| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
+| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
+| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
+| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
+| 🛠 **[Changelog]** | Changes and version history. |
+| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
+| | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
+| | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3
@@ -58,7 +55,7 @@ open-source software, released under the [MIT license](https://github.com/explos
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
[universe]: https://spacy.io/universe
-[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
+[spacy vs code extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI
[online course]: https://course.spacy.io
[project templates]: https://github.com/explosion/projects
@@ -92,7 +89,9 @@ more people can benefit from it.
- State-of-the-art speed
- Production-ready **training system**
- Linguistically-motivated **tokenization**
-- Components for named **entity recognition**, part-of-speech-tagging, dependency parsing, sentence segmentation, **text classification**, lemmatization, morphological analysis, entity linking and more
+- Components for named **entity recognition**, part-of-speech-tagging,
+ dependency parsing, sentence segmentation, **text classification**,
+ lemmatization, morphological analysis, entity linking and more
- Easily extensible with **custom components** and attributes
- Support for custom models in **PyTorch**, **TensorFlow** and other frameworks
- Built in **visualizers** for syntax and NER
@@ -109,7 +108,7 @@ For detailed installation instructions, see the
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
Studio)
-- **Python version**: Python 3.6+ (only 64 bit)
+- **Python version**: Python 3.7+ (only 64 bit)
- **Package managers**: [pip] · [conda] (via `conda-forge`)
[pip]: https://pypi.org/project/spacy/
@@ -118,8 +117,8 @@ For detailed installation instructions, see the
### pip
Using pip, spaCy releases are available as source packages and binary wheels.
-Before you install spaCy and its dependencies, make sure that
-your `pip`, `setuptools` and `wheel` are up to date.
+Before you install spaCy and its dependencies, make sure that your `pip`,
+`setuptools` and `wheel` are up to date.
```bash
pip install -U pip setuptools wheel
@@ -174,9 +173,9 @@ with the new version.
## 📦 Download model packages
-Trained pipelines for spaCy can be installed as **Python packages**. This
-means that they're a component of your application, just like any other module.
-Models can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
+Trained pipelines for spaCy can be installed as **Python packages**. This means
+that they're a component of your application, just like any other module. Models
+can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
command, or manually by pointing pip to a path or URL.
| Documentation | |
@@ -242,8 +241,7 @@ do that depends on your system.
| **Mac** | Install a recent version of [XCode](https://developer.apple.com/xcode/), including the so-called "Command Line Tools". macOS and OS X ship with Python and git preinstalled. |
| **Windows** | Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that matches the version that was used to compile your Python interpreter. |
-For more details
-and instructions, see the documentation on
+For more details and instructions, see the documentation on
[compiling spaCy from source](https://spacy.io/usage#source) and the
[quickstart widget](https://spacy.io/usage#section-quickstart) to get the right
commands for your platform and Python version.
diff --git a/build-constraints.txt b/build-constraints.txt
index c1e82f1b0..b1cf596ca 100644
--- a/build-constraints.txt
+++ b/build-constraints.txt
@@ -1,9 +1,6 @@
-# build version constraints for use with wheelwright + multibuild
-numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
-numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
+# build version constraints for use with wheelwright
+numpy==1.15.0; python_version=='3.7' and platform_machine!='aarch64'
+numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
-numpy==1.19.3; python_version=='3.9'
-numpy==1.21.3; python_version=='3.10'
-numpy==1.23.2; python_version=='3.11'
-numpy; python_version>='3.12'
+numpy>=1.25.0; python_version>='3.9'
diff --git a/extra/DEVELOPER_DOCS/Listeners.md b/extra/DEVELOPER_DOCS/Listeners.md
index 3a71082e0..72c036880 100644
--- a/extra/DEVELOPER_DOCS/Listeners.md
+++ b/extra/DEVELOPER_DOCS/Listeners.md
@@ -1,14 +1,17 @@
# Listeners
-1. [Overview](#1-overview)
-2. [Initialization](#2-initialization)
- - [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
- - [B. Shape inference](#2b-shape-inference)
-3. [Internal communication](#3-internal-communication)
- - [A. During prediction](#3a-during-prediction)
- - [B. During training](#3b-during-training)
- - [C. Frozen components](#3c-frozen-components)
-4. [Replacing listener with standalone](#4-replacing-listener-with-standalone)
+- [1. Overview](#1-overview)
+- [2. Initialization](#2-initialization)
+ - [2A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
+ - [2B. Shape inference](#2b-shape-inference)
+- [3. Internal communication](#3-internal-communication)
+ - [3A. During prediction](#3a-during-prediction)
+ - [3B. During training](#3b-during-training)
+ - [Training with multiple listeners](#training-with-multiple-listeners)
+ - [3C. Frozen components](#3c-frozen-components)
+ - [The Tok2Vec or Transformer is frozen](#the-tok2vec-or-transformer-is-frozen)
+ - [The upstream component is frozen](#the-upstream-component-is-frozen)
+- [4. Replacing listener with standalone](#4-replacing-listener-with-standalone)
## 1. Overview
@@ -62,7 +65,7 @@ of this `find_listener()` method will specifically identify sublayers of a model
If it's a Transformer-based pipeline, a
[`transformer` component](https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py)
-has a similar implementation but its `find_listener()` function will specifically look for `TransformerListener`
+has a similar implementation but its `find_listener()` function will specifically look for `TransformerListener`
sublayers of downstream components.
### 2B. Shape inference
@@ -154,7 +157,7 @@ as a tagger or a parser. This used to be impossible before 3.1, but has become s
embedding component in the [`annotating_components`](https://spacy.io/usage/training#annotating-components)
list of the config. This works like any other "annotating component" because it relies on the `Doc` attributes.
-However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related
+However, if the `Tok2Vec` or `Transformer` is frozen, and not present in `annotating_components`, and a related
listener isn't frozen, then a `W086` warning is shown and further training of the pipeline will likely end with `E954`.
#### The upstream component is frozen
@@ -216,5 +219,17 @@ new_model = tok2vec_model.attrs["replace_listener"](new_model)
```
The new config and model are then properly stored on the `nlp` object.
-Note that this functionality (running the replacement for a transformer listener) was broken prior to
+Note that this functionality (running the replacement for a transformer listener) was broken prior to
`spacy-transformers` 1.0.5.
+
+In spaCy 3.7, `Language.replace_listeners` was updated to pass the following additional arguments to the `replace_listener` callback:
+the listener to be replaced and the `tok2vec`/`transformer` pipe from which the new model was copied. To maintain backwards-compatibility,
+the method only passes these extra arguments for callbacks that support them:
+
+```python
+def replace_listener_pre_37(copied_tok2vec_model):
+ ...
+
+def replace_listener_post_37(copied_tok2vec_model, replaced_listener, tok2vec_pipe):
+ ...
+```
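A minimal sketch (not part of the diff above) of how this dispatch could be implemented, assuming the callback's parameter count is inspected before it is called; the helper name is hypothetical:

```python
import inspect

def _call_replace_listener(callback, copied_model, replaced_listener, tok2vec_pipe):
    # Hypothetical helper: pre-3.7 callbacks accept only the copied model,
    # while newer callbacks also receive the replaced listener and the pipe
    # the model was copied from.
    n_params = len(inspect.signature(callback).parameters)
    if n_params >= 3:
        return callback(copied_model, replaced_listener, tok2vec_pipe)
    return callback(copied_model)
```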
diff --git a/pyproject.toml b/pyproject.toml
index dcb5cf10d..336c0793c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,8 +5,9 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
- "thinc>=8.1.8,<8.2.0",
- "numpy>=1.15.0",
+ "thinc>=8.1.8,<8.3.0",
+ "numpy>=1.15.0; python_version < '3.9'",
+ "numpy>=1.25.0; python_version >= '3.9'",
]
build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index a007f495e..a8ba956a1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=8.1.8,<8.2.0
+thinc>=8.1.8,<8.3.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
@@ -12,11 +12,13 @@ catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
+weasel>=0.1.0,<0.4.0
# Third party dependencies
-numpy>=1.15.0
+numpy>=1.15.0; python_version < "3.9"
+numpy>=1.19.0; python_version >= "3.9"
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
@@ -31,11 +33,11 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
-types-dataclasses>=0.1.3; python_version < "3.7"
+mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8"
types-mock>=0.1.1
types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black==22.3.0
+cython-lint>=0.15.0
isort>=5.0,<6.0
diff --git a/setup.cfg b/setup.cfg
index 45734888f..75f2e3a15 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,7 +17,6 @@ classifiers =
Operating System :: Microsoft :: Windows
Programming Language :: Cython
Programming Language :: Python :: 3
- Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
@@ -31,15 +30,18 @@ project_urls =
[options]
zip_safe = false
include_package_data = true
-python_requires = >=3.6
+python_requires = >=3.7
+# NOTE: This section is superseded by pyproject.toml and will be removed in
+# spaCy v4
setup_requires =
cython>=0.25,<3.0
- numpy>=1.15.0
+ numpy>=1.15.0; python_version < "3.9"
+ numpy>=1.19.0; python_version >= "3.9"
# We also need our Cython packages here to compile against
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc>=8.1.8,<8.2.0
+ thinc>=8.1.8,<8.3.0
install_requires =
# Our libraries
spacy-legacy>=3.0.11,<3.1.0
@@ -47,18 +49,20 @@ install_requires =
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc>=8.1.8,<8.2.0
+ thinc>=8.1.8,<8.3.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
+ weasel>=0.1.0,<0.4.0
# Third-party dependencies
typer>=0.3.0,<0.10.0
pathy>=0.10.0
smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0
- numpy>=1.15.0
+ numpy>=1.15.0; python_version < "3.9"
+ numpy>=1.19.0; python_version >= "3.9"
requests>=2.13.0,<3.0.0
- pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
+ pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
jinja2
# Official Python utilities
setuptools
@@ -74,9 +78,7 @@ console_scripts =
lookups =
spacy_lookups_data>=1.0.3,<1.1.0
transformers =
- spacy_transformers>=1.1.2,<1.3.0
-ray =
- spacy_ray>=0.1.0,<1.0.0
+ spacy_transformers>=1.1.2,<1.4.0
cuda =
cupy>=5.0.0b4,<13.0.0
cuda80 =
@@ -111,6 +113,8 @@ cuda117 =
cupy-cuda117>=5.0.0b4,<13.0.0
cuda11x =
cupy-cuda11x>=11.0.0,<13.0.0
+cuda12x =
+ cupy-cuda12x>=11.5.0,<13.0.0
cuda-autodetect =
cupy-wheel>=11.0.0,<13.0.0
apple =
diff --git a/setup.py b/setup.py
index 243554c7a..33178662d 100755
--- a/setup.py
+++ b/setup.py
@@ -1,10 +1,9 @@
#!/usr/bin/env python
from setuptools import Extension, setup, find_packages
import sys
-import platform
import numpy
-from distutils.command.build_ext import build_ext
-from distutils.sysconfig import get_python_inc
+from setuptools.command.build_ext import build_ext
+from sysconfig import get_path
from pathlib import Path
import shutil
from Cython.Build import cythonize
@@ -79,6 +78,7 @@ COMPILER_DIRECTIVES = {
"language_level": -3,
"embedsignature": True,
"annotation_typing": False,
+ "profile": sys.version_info < (3, 12),
}
# Files to copy into the package that are otherwise not included
COPY_FILES = {
@@ -88,30 +88,6 @@ COPY_FILES = {
}
-def is_new_osx():
- """Check whether we're on OSX >= 10.7"""
- if sys.platform != "darwin":
- return False
- mac_ver = platform.mac_ver()[0]
- if mac_ver.startswith("10"):
- minor_version = int(mac_ver.split(".")[1])
- if minor_version >= 7:
- return True
- else:
- return False
- return False
-
-
-if is_new_osx():
- # On Mac, use libc++ because Apple deprecated use of
- # libstdc
- COMPILE_OPTIONS["other"].append("-stdlib=libc++")
- LINK_OPTIONS["other"].append("-lc++")
- # g++ (used by unix compiler on mac) links to libstdc++ as a default lib.
- # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
- LINK_OPTIONS["other"].append("-nodefaultlibs")
-
-
# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
class build_ext_options:
@@ -204,7 +180,7 @@ def setup_package():
include_dirs = [
numpy.get_include(),
- get_python_inc(plat_specific=True),
+ get_path("include"),
]
ext_modules = []
ext_modules.append(
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 1a18ad0d5..8aa2eccd7 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -13,7 +13,6 @@ from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
from . import pipeline # noqa: F401
from . import util
from .about import __version__ # noqa: F401
-from .cli.info import info # noqa: F401
from .errors import Errors
from .glossary import explain # noqa: F401
from .language import Language
@@ -77,3 +76,9 @@ def blank(
# We should accept both dot notation and nested dict here for consistency
config = util.dot_to_dict(config)
return LangClass.from_config(config, vocab=vocab, meta=meta)
+
+
+def info(*args, **kwargs):
+ from .cli.info import info as cli_info
+
+ return cli_info(*args, **kwargs)
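The wrapper above keeps the CLI module out of the top-level import path and only imports it when `info()` is actually called; usage is unchanged. A short sketch (the returned keys are an assumption):

```python
import spacy

# Importing spacy no longer imports spacy.cli.info eagerly; the import
# happens lazily on the first call to spacy.info().
details = spacy.info()
print(details.get("spacy_version"))
```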
diff --git a/spacy/about.py b/spacy/about.py
index cad6158da..1a3367673 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,7 +1,5 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.6.0"
+__version__ = "3.7.0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__projects__ = "https://github.com/explosion/projects"
-__projects_branch__ = "v3"
diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd
index 6dc9ecaee..fbbac0ec2 100644
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@@ -96,4 +96,4 @@ cdef enum attr_id_t:
ENT_ID = symbols.ENT_ID
IDX
- SENT_END
\ No newline at end of file
+ SENT_END
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index dc8eed7c3..363dd094d 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -1,3 +1,4 @@
+# cython: profile=False
from .errors import Errors
IOB_STRINGS = ("", "I", "O", "B")
@@ -117,7 +118,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
if "pos" in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if "morph" in stringy_attrs:
- morphs = stringy_attrs.pop("morph")
+ morphs = stringy_attrs.pop("morph") # no-cython-lint
if "number" in stringy_attrs:
stringy_attrs.pop("number")
if "tenspect" in stringy_attrs:
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 549a27616..f3c6dbfed 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -14,6 +14,7 @@ from .debug_diff import debug_diff # noqa: F401
from .debug_model import debug_model # noqa: F401
from .download import download # noqa: F401
from .evaluate import evaluate # noqa: F401
+from .find_function import find_function # noqa: F401
from .find_threshold import find_threshold # noqa: F401
from .info import info # noqa: F401
from .init_config import fill_config, init_config # noqa: F401
@@ -21,13 +22,6 @@ from .init_pipeline import init_pipeline_cli # noqa: F401
from .package import package # noqa: F401
from .pretrain import pretrain # noqa: F401
from .profile import profile # noqa: F401
-from .project.assets import project_assets # noqa: F401
-from .project.clone import project_clone # noqa: F401
-from .project.document import project_document # noqa: F401
-from .project.dvc import project_update_dvc # noqa: F401
-from .project.pull import project_pull # noqa: F401
-from .project.push import project_push # noqa: F401
-from .project.run import project_run # noqa: F401
from .train import train_cli # noqa: F401
from .validate import validate # noqa: F401
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index eff897316..bc6c53cd9 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -25,10 +25,11 @@ from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import gpu_is_available
from typer.main import get_command
from wasabi import Printer, msg
+from weasel import app as project_cli
from .. import about
from ..compat import Literal
-from ..schemas import ProjectConfigSchema, validate
+from ..schemas import validate
from ..util import (
ENV_VARS,
SimpleFrozenDict,
@@ -48,7 +49,6 @@ SDIST_SUFFIX = ".tar.gz"
WHEEL_SUFFIX = "-py3-none-any.whl"
PROJECT_FILE = "project.yml"
-PROJECT_LOCK = "project.lock"
COMMAND = "python -m spacy"
NAME = "spacy"
HELP = """spaCy Command-line Interface
@@ -74,11 +74,10 @@ Opt = typer.Option
app = typer.Typer(name=NAME, help=HELP)
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
-project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
-app.add_typer(project_cli)
+app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True)
app.add_typer(debug_cli)
app.add_typer(benchmark_cli)
app.add_typer(init_cli)
@@ -153,148 +152,6 @@ def _parse_override(value: Any) -> Any:
return str(value)
-def load_project_config(
- path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
-) -> Dict[str, Any]:
- """Load the project.yml file from a directory and validate it. Also make
- sure that all directories defined in the config exist.
-
- path (Path): The path to the project directory.
- interpolate (bool): Whether to substitute project variables.
- overrides (Dict[str, Any]): Optional config overrides.
- RETURNS (Dict[str, Any]): The loaded project.yml.
- """
- config_path = path / PROJECT_FILE
- if not config_path.exists():
- msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
- invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
- try:
- config = srsly.read_yaml(config_path)
- except ValueError as e:
- msg.fail(invalid_err, e, exits=1)
- errors = validate(ProjectConfigSchema, config)
- if errors:
- msg.fail(invalid_err)
- print("\n".join(errors))
- sys.exit(1)
- validate_project_version(config)
- validate_project_commands(config)
- if interpolate:
- err = f"{PROJECT_FILE} validation error"
- with show_validation_error(title=err, hint_fill=False):
- config = substitute_project_variables(config, overrides)
- # Make sure directories defined in config exist
- for subdir in config.get("directories", []):
- dir_path = path / subdir
- if not dir_path.exists():
- dir_path.mkdir(parents=True)
- return config
-
-
-def substitute_project_variables(
- config: Dict[str, Any],
- overrides: Dict[str, Any] = SimpleFrozenDict(),
- key: str = "vars",
- env_key: str = "env",
-) -> Dict[str, Any]:
- """Interpolate variables in the project file using the config system.
-
- config (Dict[str, Any]): The project config.
- overrides (Dict[str, Any]): Optional config overrides.
- key (str): Key containing variables in project config.
- env_key (str): Key containing environment variable mapping in project config.
- RETURNS (Dict[str, Any]): The interpolated project config.
- """
- config.setdefault(key, {})
- config.setdefault(env_key, {})
- # Substitute references to env vars with their values
- for config_var, env_var in config[env_key].items():
- config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
- # Need to put variables in the top scope again so we can have a top-level
- # section "project" (otherwise, a list of commands in the top scope wouldn't)
- # be allowed by Thinc's config system
- cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
- cfg = Config().from_str(cfg.to_str(), overrides=overrides)
- interpolated = cfg.interpolate()
- return dict(interpolated["project"])
-
-
-def validate_project_version(config: Dict[str, Any]) -> None:
- """If the project defines a compatible spaCy version range, chec that it's
- compatible with the current version of spaCy.
-
- config (Dict[str, Any]): The loaded config.
- """
- spacy_version = config.get("spacy_version", None)
- if spacy_version and not is_compatible_version(about.__version__, spacy_version):
- err = (
- f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
- f"that's not compatible with the version of spaCy you're running "
- f"({about.__version__}). You can edit version requirement in the "
- f"{PROJECT_FILE} to load it, but the project may not run as expected."
- )
- msg.fail(err, exits=1)
-
-
-def validate_project_commands(config: Dict[str, Any]) -> None:
- """Check that project commands and workflows are valid, don't contain
- duplicates, don't clash and only refer to commands that exist.
-
- config (Dict[str, Any]): The loaded config.
- """
- command_names = [cmd["name"] for cmd in config.get("commands", [])]
- workflows = config.get("workflows", {})
- duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
- if duplicates:
- err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
- msg.fail(err, exits=1)
- for workflow_name, workflow_steps in workflows.items():
- if workflow_name in command_names:
- err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
- msg.fail(err, exits=1)
- for step in workflow_steps:
- if step not in command_names:
- msg.fail(
- f"Unknown command specified in workflow '{workflow_name}': {step}",
- f"Workflows can only refer to commands defined in the 'commands' "
- f"section of the {PROJECT_FILE}.",
- exits=1,
- )
-
-
-def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
- """Get the hash for a JSON-serializable object.
-
- data: The data to hash.
- exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
- RETURNS (str): The hash.
- """
- if isinstance(data, dict):
- data = {k: v for k, v in data.items() if k not in exclude}
- data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
- return hashlib.md5(data_str).hexdigest()
-
-
-def get_checksum(path: Union[Path, str]) -> str:
- """Get the checksum for a file or directory given its file path. If a
- directory path is provided, this uses all files in that directory.
-
- path (Union[Path, str]): The file or directory path.
- RETURNS (str): The checksum.
- """
- path = Path(path)
- if not (path.is_file() or path.is_dir()):
- msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
- if path.is_file():
- return hashlib.md5(Path(path).read_bytes()).hexdigest()
- else:
- # TODO: this is currently pretty slow
- dir_checksum = hashlib.md5()
- for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
- dir_checksum.update(sub_file.read_bytes())
- return dir_checksum.hexdigest()
-
-
@contextmanager
def show_validation_error(
file_path: Optional[Union[str, Path]] = None,
@@ -352,166 +209,10 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
-def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
- """Upload a file.
-
- src (Path): The source path.
- url (str): The destination URL to upload to.
- """
- import smart_open
-
- # Create parent directories for local paths
- if isinstance(dest, Path):
- if not dest.parent.exists():
- dest.parent.mkdir(parents=True)
-
- dest = str(dest)
- with smart_open.open(dest, mode="wb") as output_file:
- with src.open(mode="rb") as input_file:
- output_file.write(input_file.read())
-
-
-def download_file(
- src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
-) -> None:
- """Download a file using smart_open.
-
- url (str): The URL of the file.
- dest (Path): The destination path.
- force (bool): Whether to force download even if file exists.
- If False, the download will be skipped.
- """
- import smart_open
-
- if dest.exists() and not force:
- return None
- src = str(src)
- with smart_open.open(src, mode="rb", compression="disable") as input_file:
- with dest.open(mode="wb") as output_file:
- shutil.copyfileobj(input_file, output_file)
-
-
-def ensure_pathy(path):
- """Temporary helper to prevent importing Pathy globally (which can cause
- slow and annoying Google Cloud warning)."""
- from pathy import Pathy # noqa: F811
-
- return Pathy.fluid(path)
-
-
-def git_checkout(
- repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
-):
- git_version = get_git_version()
- if dest.exists():
- msg.fail("Destination of checkout must not exist", exits=1)
- if not dest.parent.exists():
- msg.fail("Parent of destination of checkout must exist", exits=1)
- if sparse and git_version >= (2, 22):
- return git_sparse_checkout(repo, subpath, dest, branch)
- elif sparse:
- # Only show warnings if the user explicitly wants sparse checkout but
- # the Git version doesn't support it
- err_old = (
- f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
- f"that doesn't fully support sparse checkout yet."
- )
- err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
- msg.warn(
- f"{err_unk if git_version == (0, 0) else err_old} "
- f"This means that more files than necessary may be downloaded "
- f"temporarily. To only download the files needed, make sure "
- f"you're using Git v2.22 or above."
- )
- with make_tempdir() as tmp_dir:
- cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
- run_command(cmd, capture=True)
- # We need Path(name) to make sure we also support subdirectories
- try:
- source_path = tmp_dir / Path(subpath)
- if not is_subpath_of(tmp_dir, source_path):
- err = f"'{subpath}' is a path outside of the cloned repository."
- msg.fail(err, repo, exits=1)
- shutil.copytree(str(source_path), str(dest))
- except FileNotFoundError:
- err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
- msg.fail(err, repo, exits=1)
-
-
-def git_sparse_checkout(repo, subpath, dest, branch):
- # We're using Git, partial clone and sparse checkout to
- # only clone the files we need
- # This ends up being RIDICULOUS. omg.
- # So, every tutorial and SO post talks about 'sparse checkout'...But they
- # go and *clone* the whole repo. Worthless. And cloning part of a repo
- # turns out to be completely broken. The only way to specify a "path" is..
- # a path *on the server*? The contents of which, specifies the paths. Wat.
- # Obviously this is hopelessly broken and insecure, because you can query
- # arbitrary paths on the server! So nobody enables this.
- # What we have to do is disable *all* files. We could then just checkout
- # the path, and it'd "work", but be hopelessly slow...Because it goes and
- # transfers every missing object one-by-one. So the final piece is that we
- # need to use some weird git internals to fetch the missings in bulk, and
- # *that* we can do by path.
- # We're using Git and sparse checkout to only clone the files we need
- with make_tempdir() as tmp_dir:
- # This is the "clone, but don't download anything" part.
- cmd = (
- f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
- f"-b {branch} --filter=blob:none"
- )
- run_command(cmd)
- # Now we need to find the missing filenames for the subpath we want.
- # Looking for this 'rev-list' command in the git --help? Hah.
- cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
- ret = run_command(cmd, capture=True)
- git_repo = _http_to_git(repo)
- # Now pass those missings into another bit of git internals
- missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
- if not missings:
- err = (
- f"Could not find any relevant files for '{subpath}'. "
- f"Did you specify a correct and complete path within repo '{repo}' "
- f"and branch {branch}?"
- )
- msg.fail(err, exits=1)
- cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
- run_command(cmd, capture=True)
- # And finally, we can checkout our subpath
- cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
- run_command(cmd, capture=True)
-
- # Get a subdirectory of the cloned path, if appropriate
- source_path = tmp_dir / Path(subpath)
- if not is_subpath_of(tmp_dir, source_path):
- err = f"'{subpath}' is a path outside of the cloned repository."
- msg.fail(err, repo, exits=1)
-
- shutil.move(str(source_path), str(dest))
-
-
-def git_repo_branch_exists(repo: str, branch: str) -> bool:
- """Uses 'git ls-remote' to check if a repository and branch exists
-
- repo (str): URL to get repo.
- branch (str): Branch on repo to check.
- RETURNS (bool): True if repo:branch exists.
- """
- get_git_version()
- cmd = f"git ls-remote {repo} {branch}"
- # We might be tempted to use `--exit-code` with `git ls-remote`, but
- # `run_command` handles the `returncode` for us, so we'll rely on
- # the fact that stdout returns '' if the requested branch doesn't exist
- ret = run_command(cmd, capture=True)
- exists = ret.stdout != ""
- return exists
-
-
def get_git_version(
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]:
"""Get the version of git and raise an error if calling 'git --version' fails.
-
error (str): The error message to show.
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
(0, 0) if the version couldn't be determined.
@@ -527,30 +228,6 @@ def get_git_version(
return int(version[0]), int(version[1])
-def _http_to_git(repo: str) -> str:
- if repo.startswith("http://"):
- repo = repo.replace(r"http://", r"https://")
- if repo.startswith(r"https://"):
- repo = repo.replace("https://", "git@").replace("/", ":", 1)
- if repo.endswith("/"):
- repo = repo[:-1]
- repo = f"{repo}.git"
- return repo
-
-
-def is_subpath_of(parent, child):
- """
- Check whether `child` is a path contained within `parent`.
- """
- # Based on https://stackoverflow.com/a/37095733 .
-
- # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
- # we can stop using crusty old os.path functions.
- parent_realpath = os.path.realpath(parent)
- child_realpath = os.path.realpath(child)
- return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
-
-
@overload
def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
...
diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py
index 8c4b4c8bf..ffd810506 100644
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@@ -133,7 +133,9 @@ def apply(
if len(text_files) > 0:
streams.append(_stream_texts(text_files))
datagen = cast(DocOrStrStream, chain(*streams))
- for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
+ for doc in tqdm.tqdm(
+ nlp.pipe(datagen, batch_size=batch_size, n_process=n_process), disable=None
+ ):
docbin.add(doc)
if output_file.suffix == "":
output_file = output_file.with_suffix(".spacy")
diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py
index ee2500b27..f74bbacb5 100644
--- a/spacy/cli/assemble.py
+++ b/spacy/cli/assemble.py
@@ -40,7 +40,8 @@ def assemble_cli(
DOCS: https://spacy.io/api/cli#assemble
"""
- util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+ if verbose:
+ util.logger.setLevel(logging.DEBUG)
# Make sure all files and paths exists if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py
index a683d1591..c7fd771c3 100644
--- a/spacy/cli/benchmark_speed.py
+++ b/spacy/cli/benchmark_speed.py
@@ -89,7 +89,7 @@ class Quartiles:
def annotate(
nlp: Language, docs: List[Doc], batch_size: Optional[int]
) -> numpy.ndarray:
- docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
+ docs = nlp.pipe(tqdm(docs, unit="doc", disable=None), batch_size=batch_size)
wps = []
while True:
with time_context() as elapsed:
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 6235b658d..2276ca6b0 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -28,6 +28,7 @@ def evaluate_cli(
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
+ spans_key: str = Opt("sc", "--spans-key", "-sk", help="Spans key to use when evaluating Doc.spans"),
# fmt: on
):
"""
@@ -53,6 +54,7 @@ def evaluate_cli(
displacy_limit=displacy_limit,
per_component=per_component,
silent=False,
+ spans_key=spans_key,
)
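The new `--spans-key` option is forwarded to the underlying `evaluate()` call. A hedged sketch of the equivalent Python usage (model path, data path and key name are illustrative):

```python
from spacy.cli.evaluate import evaluate

# Score spans stored under a custom key in Doc.spans instead of the
# default "sc"; paths and the key are placeholders.
scores = evaluate("./model-best", "./corpus/dev.spacy", spans_key="my_spans")
print(scores.get("spans_my_spans_f"))
```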
diff --git a/spacy/cli/find_function.py b/spacy/cli/find_function.py
new file mode 100644
index 000000000..f99ce2adc
--- /dev/null
+++ b/spacy/cli/find_function.py
@@ -0,0 +1,69 @@
+from typing import Optional, Tuple
+
+from catalogue import RegistryError
+from wasabi import msg
+
+from ..util import registry
+from ._util import Arg, Opt, app
+
+
+@app.command("find-function")
+def find_function_cli(
+ # fmt: off
+ func_name: str = Arg(..., help="Name of the registered function."),
+ registry_name: Optional[str] = Opt(None, "--registry", "-r", help="Name of the catalogue registry."),
+ # fmt: on
+):
+ """
+ Find the module, path and line number to the file the registered
+ function is defined in, if available.
+
+ func_name (str): Name of the registered function.
+ registry_name (Optional[str]): Name of the catalogue registry.
+
+ DOCS: https://spacy.io/api/cli#find-function
+ """
+ if not registry_name:
+ registry_names = registry.get_registry_names()
+ for name in registry_names:
+ if registry.has(name, func_name):
+ registry_name = name
+ break
+
+ if not registry_name:
+ msg.fail(
+ f"Couldn't find registered function: '{func_name}'",
+ exits=1,
+ )
+
+ assert registry_name is not None
+ find_function(func_name, registry_name)
+
+
+def find_function(func_name: str, registry_name: str) -> Tuple[str, int]:
+ registry_desc = None
+ try:
+ registry_desc = registry.find(registry_name, func_name)
+ except RegistryError as e:
+ msg.fail(
+ f"Couldn't find registered function: '{func_name}' in registry '{registry_name}'",
+ )
+ msg.fail(f"{e}", exits=1)
+ assert registry_desc is not None
+
+ registry_path = None
+ line_no = None
+ if registry_desc["file"]:
+ registry_path = registry_desc["file"]
+ line_no = registry_desc["line_no"]
+
+ if not registry_path or not line_no:
+ msg.fail(
+ f"Couldn't find path to registered function: '{func_name}' in registry '{registry_name}'",
+ exits=1,
+ )
+ assert registry_path is not None
+ assert line_no is not None
+
+ msg.good(f"Found registered function '{func_name}' at {registry_path}:{line_no}")
+ return str(registry_path), int(line_no)
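A hedged example of calling the new helper directly (the registered name and registry used here are illustrative; the same lookup is exposed on the command line as `spacy find-function`):

```python
from spacy.cli.find_function import find_function

# Locate the file and line where a registered function is defined.
# The function and registry names are examples, not part of the diff.
path, line_no = find_function("spacy.Tok2Vec.v2", "architectures")
print(f"{path}:{line_no}")
```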
diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index 7aa32c0c6..48077fa51 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -52,8 +52,8 @@ def find_threshold_cli(
DOCS: https://spacy.io/api/cli#find-threshold
"""
-
- util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+ if verbose:
+ util.logger.setLevel(logging.DEBUG)
import_code(code_path)
find_threshold(
model=model,
diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py
index 13202cb60..21eea8edf 100644
--- a/spacy/cli/init_pipeline.py
+++ b/spacy/cli/init_pipeline.py
@@ -39,7 +39,8 @@ def init_vectors_cli(
you can use in the [initialize] block of your config to initialize
a model with vectors.
"""
- util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+ if verbose:
+ util.logger.setLevel(logging.DEBUG)
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
@@ -87,7 +88,8 @@ def init_pipeline_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
- util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+ if verbose:
+ util.logger.setLevel(logging.DEBUG)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
@@ -116,7 +118,8 @@ def init_labels_cli(
"""Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
extract the labels."""
- util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+ if verbose:
+ util.logger.setLevel(logging.DEBUG)
if not output_path.exists():
output_path.mkdir(parents=True)
overrides = parse_config_overrides(ctx.args)
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 4545578e6..12f195be1 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -403,7 +403,7 @@ def _format_sources(data: Any) -> str:
if author:
result += " ({})".format(author)
sources.append(result)
- return " ".join(sources)
+ return " ".join(sources)
def _format_accuracy(data: Dict[str, Any], exclude: List[str] = ["speed"]) -> str:
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index e1f720327..e5b8f1193 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -71,7 +71,7 @@ def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) ->
def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
- for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
+ for doc in nlp.pipe(tqdm.tqdm(texts, disable=None), batch_size=16):
pass
diff --git a/spacy/cli/project/__init__.py b/spacy/cli/project/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
deleted file mode 100644
index aa2705986..000000000
--- a/spacy/cli/project/assets.py
+++ /dev/null
@@ -1,217 +0,0 @@
-import os
-import re
-import shutil
-from pathlib import Path
-from typing import Any, Dict, Optional
-
-import requests
-import typer
-from wasabi import msg
-
-from ...util import ensure_path, working_dir
-from .._util import (
- PROJECT_FILE,
- Arg,
- Opt,
- SimpleFrozenDict,
- download_file,
- get_checksum,
- get_git_version,
- git_checkout,
- load_project_config,
- parse_config_overrides,
- project_cli,
-)
-
-# Whether assets are extra if `extra` is not set.
-EXTRA_DEFAULT = False
-
-
-@project_cli.command(
- "assets",
- context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)
-def project_assets_cli(
- # fmt: off
- ctx: typer.Context, # This is only used to read additional arguments
- project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
- sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
- extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
- # fmt: on
-):
- """Fetch project assets like datasets and pretrained weights. Assets are
- defined in the "assets" section of the project.yml. If a checksum is
- provided in the project.yml, the file is only downloaded if no local file
- with the same checksum exists.
-
- DOCS: https://spacy.io/api/cli#project-assets
- """
- overrides = parse_config_overrides(ctx.args)
- project_assets(
- project_dir,
- overrides=overrides,
- sparse_checkout=sparse_checkout,
- extra=extra,
- )
-
-
-def project_assets(
- project_dir: Path,
- *,
- overrides: Dict[str, Any] = SimpleFrozenDict(),
- sparse_checkout: bool = False,
- extra: bool = False,
-) -> None:
- """Fetch assets for a project using DVC if possible.
-
- project_dir (Path): Path to project directory.
- sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
- needed.
- extra (bool): Whether to download all assets, including those marked as 'extra'.
- """
- project_path = ensure_path(project_dir)
- config = load_project_config(project_path, overrides=overrides)
- assets = [
- asset
- for asset in config.get("assets", [])
- if extra or not asset.get("extra", EXTRA_DEFAULT)
- ]
- if not assets:
- msg.warn(
- f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
- exits=0,
- )
- msg.info(f"Fetching {len(assets)} asset(s)")
-
- for asset in assets:
- dest = (project_dir / asset["dest"]).resolve()
- checksum = asset.get("checksum")
- if "git" in asset:
- git_err = (
- f"Cloning spaCy project templates requires Git and the 'git' command. "
- f"Make sure it's installed and that the executable is available."
- )
- get_git_version(error=git_err)
- if dest.exists():
- # If there's already a file, check for checksum
- if checksum and checksum == get_checksum(dest):
- msg.good(
- f"Skipping download with matching checksum: {asset['dest']}"
- )
- continue
- else:
- if dest.is_dir():
- shutil.rmtree(dest)
- else:
- dest.unlink()
- if "repo" not in asset["git"] or asset["git"]["repo"] is None:
- msg.fail(
- "A git asset must include 'repo', the repository address.", exits=1
- )
- if "path" not in asset["git"] or asset["git"]["path"] is None:
- msg.fail(
- "A git asset must include 'path' - use \"\" to get the entire repository.",
- exits=1,
- )
- git_checkout(
- asset["git"]["repo"],
- asset["git"]["path"],
- dest,
- branch=asset["git"].get("branch"),
- sparse=sparse_checkout,
- )
- msg.good(f"Downloaded asset {dest}")
- else:
- url = asset.get("url")
- if not url:
- # project.yml defines asset without URL that the user has to place
- check_private_asset(dest, checksum)
- continue
- fetch_asset(project_path, url, dest, checksum)
-
-
-def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
- """Check and validate assets without a URL (private assets that the user
- has to provide themselves) and give feedback about the checksum.
-
- dest (Path): Destination path of the asset.
- checksum (Optional[str]): Optional checksum of the expected file.
- """
- if not Path(dest).exists():
- err = f"No URL provided for asset. You need to add this file yourself: {dest}"
- msg.warn(err)
- else:
- if not checksum:
- msg.good(f"Asset already exists: {dest}")
- elif checksum == get_checksum(dest):
- msg.good(f"Asset exists with matching checksum: {dest}")
- else:
- msg.fail(f"Asset available but with incorrect checksum: {dest}")
-
-
-def fetch_asset(
- project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
-) -> None:
- """Fetch an asset from a given URL or path. If a checksum is provided and a
- local file exists, it's only re-downloaded if the checksum doesn't match.
-
- project_path (Path): Path to project directory.
- url (str): URL or path to asset.
- checksum (Optional[str]): Optional expected checksum of local file.
- RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
- the asset failed.
- """
- dest_path = (project_path / dest).resolve()
- if dest_path.exists():
- # If there's already a file, check for checksum
- if checksum:
- if checksum == get_checksum(dest_path):
- msg.good(f"Skipping download with matching checksum: {dest}")
- return
- else:
- # If there's not a checksum, make sure the file is a possibly valid size
- if os.path.getsize(dest_path) == 0:
- msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
- os.remove(dest_path)
- # We might as well support the user here and create parent directories in
- # case the asset dir isn't listed as a dir to create in the project.yml
- if not dest_path.parent.exists():
- dest_path.parent.mkdir(parents=True)
- with working_dir(project_path):
- url = convert_asset_url(url)
- try:
- download_file(url, dest_path)
- msg.good(f"Downloaded asset {dest}")
- except requests.exceptions.RequestException as e:
- if Path(url).exists() and Path(url).is_file():
- # If it's a local file, copy to destination
- shutil.copy(url, str(dest_path))
- msg.good(f"Copied local asset {dest}")
- else:
- msg.fail(f"Download failed: {dest}", e)
- if checksum and checksum != get_checksum(dest_path):
- msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
-
-
-def convert_asset_url(url: str) -> str:
- """Check and convert the asset URL if needed.
-
- url (str): The asset URL.
- RETURNS (str): The converted URL.
- """
- # If the asset URL is a regular GitHub URL it's likely a mistake
- if (
- re.match(r"(http(s?)):\/\/github.com", url)
- and "releases/download" not in url
- and "/raw/" not in url
- ):
- converted = url.replace("github.com", "raw.githubusercontent.com")
- converted = re.sub(r"/(tree|blob)/", "/", converted)
- msg.warn(
- "Downloading from a regular GitHub URL. This will only download "
- "the source of the page, not the actual file. Converting the URL "
- "to a raw URL.",
- converted,
- )
- return converted
- return url
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
deleted file mode 100644
index 2ee27c92a..000000000
--- a/spacy/cli/project/clone.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import re
-import subprocess
-from pathlib import Path
-from typing import Optional
-
-from wasabi import msg
-
-from ... import about
-from ...util import ensure_path
-from .._util import (
- COMMAND,
- PROJECT_FILE,
- Arg,
- Opt,
- get_git_version,
- git_checkout,
- git_repo_branch_exists,
- project_cli,
-)
-
-DEFAULT_REPO = about.__projects__
-DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
-DEFAULT_BRANCHES = ["main", "master"]
-
-
-@project_cli.command("clone")
-def project_clone_cli(
- # fmt: off
- name: str = Arg(..., help="The name of the template to clone"),
- dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
- repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
- branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
- sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
- # fmt: on
-):
- """Clone a project template from a repository. Calls into "git" and will
- only download the files from the given subdirectory. The GitHub repo
- defaults to the official spaCy template repo, but can be customized
- (including using a private repo).
-
- DOCS: https://spacy.io/api/cli#project-clone
- """
- if dest is None:
- dest = Path.cwd() / Path(name).parts[-1]
- if repo == DEFAULT_REPO and branch is None:
- branch = DEFAULT_PROJECTS_BRANCH
-
- if branch is None:
- for default_branch in DEFAULT_BRANCHES:
- if git_repo_branch_exists(repo, default_branch):
- branch = default_branch
- break
- if branch is None:
- default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
- msg.fail(
- "No branch provided and attempted default "
- f"branches {default_branches_msg} do not exist.",
- exits=1,
- )
- else:
- if not git_repo_branch_exists(repo, branch):
- msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
- assert isinstance(branch, str)
- project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
-
-
-def project_clone(
- name: str,
- dest: Path,
- *,
- repo: str = about.__projects__,
- branch: str = about.__projects_branch__,
- sparse_checkout: bool = False,
-) -> None:
- """Clone a project template from a repository.
-
- name (str): Name of subdirectory to clone.
- dest (Path): Destination path of cloned project.
- repo (str): URL of Git repo containing project templates.
- branch (str): The branch to clone from
- """
- dest = ensure_path(dest)
- check_clone(name, dest, repo)
- project_dir = dest.resolve()
- repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
- try:
- git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
- except subprocess.CalledProcessError:
- err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
- msg.fail(err, exits=1)
- msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
- if not (project_dir / PROJECT_FILE).exists():
- msg.warn(f"No {PROJECT_FILE} found in directory")
- else:
- msg.good(f"Your project is now ready!")
- print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
-
-
-def check_clone(name: str, dest: Path, repo: str) -> None:
- """Check and validate that the destination path can be used to clone. Will
- check that Git is available and that the destination path is suitable.
-
- name (str): Name of the directory to clone from the repo.
- dest (Path): Local destination of cloned directory.
- repo (str): URL of the repo to clone from.
- """
- git_err = (
- f"Cloning spaCy project templates requires Git and the 'git' command. "
- f"To clone a project without Git, copy the files from the '{name}' "
- f"directory in the {repo} to {dest} manually."
- )
- get_git_version(error=git_err)
- if not dest:
- msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
- if dest.exists():
- # Directory already exists (not allowed, clone needs to create it)
- msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
- if not dest.parent.exists():
- # We're not creating parents, parent dir should exist
- msg.fail(
- f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
- f"Create the necessary folder(s) first before continuing.",
- exits=1,
- )
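For context, a minimal sketch of how the removed clone helper was typically driven from Python (the template name and destination are illustrative, and the import path only exists in releases that still ship the project CLI):

from pathlib import Path

from spacy.cli.project.clone import project_clone

# Clone one template subdirectory from the default projects repo into ./demo.
# "pipelines/tagger_parser_ud" is an example template name; the destination
# must not exist yet, since check_clone() refuses to overwrite directories.
project_clone(
    "pipelines/tagger_parser_ud",
    Path.cwd() / "demo",
    sparse_checkout=False,
)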
diff --git a/spacy/cli/project/document.py b/spacy/cli/project/document.py
deleted file mode 100644
index 80107d27a..000000000
--- a/spacy/cli/project/document.py
+++ /dev/null
@@ -1,115 +0,0 @@
-from pathlib import Path
-
-from wasabi import MarkdownRenderer, msg
-
-from ...util import working_dir
-from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli
-
-DOCS_URL = "https://spacy.io"
-INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
-project, as well as the available commands and workflows. For details, see the
-[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
-INTRO_COMMANDS = f"""The following commands are defined by the project. They
-can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
-Commands are only re-run if their inputs have changed."""
-INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
-can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
-and will run the specified commands in order. Commands are only re-run if their
-inputs have changed."""
-INTRO_ASSETS = f"""The following assets are defined by the project. They can
-be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
-in the project directory."""
-# These markers are added to the Markdown and can be used to update the file in
-# place if it already exists. Only the auto-generated part will be replaced.
-MARKER_START = ""
-MARKER_END = ""
-# If this marker is used in an existing README, it's ignored and not replaced
-MARKER_IGNORE = ""
-
-
-@project_cli.command("document")
-def project_document_cli(
- # fmt: off
- project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
- output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
- no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
- # fmt: on
-):
- """
- Auto-generate a README.md for a project. If the content is saved to a file,
- hidden markers are added so you can add custom content before or after the
- auto-generated section and only the auto-generated docs will be replaced
- when you re-run the command.
-
- DOCS: https://spacy.io/api/cli#project-document
- """
- project_document(project_dir, output_file, no_emoji=no_emoji)
-
-
-def project_document(
- project_dir: Path, output_file: Path, *, no_emoji: bool = False
-) -> None:
- is_stdout = str(output_file) == "-"
- config = load_project_config(project_dir)
- md = MarkdownRenderer(no_emoji=no_emoji)
- md.add(MARKER_START)
- title = config.get("title")
- description = config.get("description")
- md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
- if description:
- md.add(description)
- md.add(md.title(2, PROJECT_FILE, "📋"))
- md.add(INTRO_PROJECT)
- # Commands
- cmds = config.get("commands", [])
- data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
- if data:
- md.add(md.title(3, "Commands", "⏯"))
- md.add(INTRO_COMMANDS)
- md.add(md.table(data, ["Command", "Description"]))
- # Workflows
- wfs = config.get("workflows", {}).items()
- data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs]
- if data:
- md.add(md.title(3, "Workflows", "⏭"))
- md.add(INTRO_WORKFLOWS)
- md.add(md.table(data, ["Workflow", "Steps"]))
- # Assets
- assets = config.get("assets", [])
- data = []
- for a in assets:
- source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
- dest_path = a["dest"]
- dest = md.code(dest_path)
- if source == "Local":
- # Only link assets if they're in the repo
- with working_dir(project_dir) as p:
- if (p / dest_path).exists():
- dest = md.link(dest, dest_path)
- data.append((dest, source, a.get("description", "")))
- if data:
- md.add(md.title(3, "Assets", "🗂"))
- md.add(INTRO_ASSETS)
- md.add(md.table(data, ["File", "Source", "Description"]))
- md.add(MARKER_END)
- # Output result
- if is_stdout:
- print(md.text)
- else:
- content = md.text
- if output_file.exists():
- with output_file.open("r", encoding="utf8") as f:
- existing = f.read()
- if MARKER_IGNORE in existing:
- msg.warn("Found ignore marker in existing file: skipping", output_file)
- return
- if MARKER_START in existing and MARKER_END in existing:
- msg.info("Found existing file: only replacing auto-generated docs")
- before = existing.split(MARKER_START)[0]
- after = existing.split(MARKER_END)[1]
- content = f"{before}{content}{after}"
- else:
- msg.warn("Replacing existing file")
- with output_file.open("w", encoding="utf8") as f:
- f.write(content)
- msg.good("Saved project documentation", output_file)
diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
deleted file mode 100644
index 9ad55c433..000000000
--- a/spacy/cli/project/dvc.py
+++ /dev/null
@@ -1,220 +0,0 @@
-"""This module contains helpers and subcommands for integrating spaCy projects
-with Data Version Control (DVC). https://dvc.org"""
-import subprocess
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional
-
-from wasabi import msg
-
-from ...util import (
- SimpleFrozenList,
- join_command,
- run_command,
- split_command,
- working_dir,
-)
-from .._util import (
- COMMAND,
- NAME,
- PROJECT_FILE,
- Arg,
- Opt,
- get_hash,
- load_project_config,
- project_cli,
-)
-
-DVC_CONFIG = "dvc.yaml"
-DVC_DIR = ".dvc"
-UPDATE_COMMAND = "dvc"
-DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
-# edited your {PROJECT_FILE}, you can regenerate this file by running:
-# {COMMAND} project {UPDATE_COMMAND}"""
-
-
-@project_cli.command(UPDATE_COMMAND)
-def project_update_dvc_cli(
- # fmt: off
- project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
- workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
- verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
- quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
- force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
- # fmt: on
-):
- """Auto-generate Data Version Control (DVC) config. A DVC
- project can only define one pipeline, so you need to specify one workflow
- defined in the project.yml. If no workflow is specified, the first defined
- workflow is used. The DVC config will only be updated if the project.yml
- changed.
-
- DOCS: https://spacy.io/api/cli#project-dvc
- """
- project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
-
-
-def project_update_dvc(
- project_dir: Path,
- workflow: Optional[str] = None,
- *,
- verbose: bool = False,
- quiet: bool = False,
- force: bool = False,
-) -> None:
- """Update the auto-generated Data Version Control (DVC) config file. A DVC
- project can only define one pipeline, so you need to specify one workflow
- defined in the project.yml. Will only update the file if the checksum changed.
-
- project_dir (Path): The project directory.
- workflow (Optional[str]): Optional name of workflow defined in project.yml.
- If not set, the first workflow will be used.
- verbose (bool): Print more info.
- quiet (bool): Print less info.
- force (bool): Force update DVC config.
- """
- config = load_project_config(project_dir)
- updated = update_dvc_config(
- project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
- )
- help_msg = "To execute the workflow with DVC, run: dvc repro"
- if updated:
- msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
- else:
- msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
-
-
-def update_dvc_config(
- path: Path,
- config: Dict[str, Any],
- workflow: Optional[str] = None,
- verbose: bool = False,
- quiet: bool = False,
- force: bool = False,
-) -> bool:
- """Re-run the DVC commands in dry mode and update dvc.yaml file in the
- project directory. The file is auto-generated based on the config. The
- first line of the auto-generated file specifies the hash of the config
- dict, so if any of the config values change, the DVC config is regenerated.
-
- path (Path): The path to the project directory.
- config (Dict[str, Any]): The loaded project.yml.
- workflow (Optional[str]): Name of the workflow defined in project.yml to
- convert. Defaults to the first workflow if not set.
- verbose (bool): Whether to print additional info (via DVC).
- quiet (bool): Don't output anything (via DVC).
- force (bool): Force update, even if hashes match.
- RETURNS (bool): Whether the DVC config file was updated.
- """
- ensure_dvc(path)
- workflows = config.get("workflows", {})
- workflow_names = list(workflows.keys())
- check_workflows(workflow_names, workflow)
- if not workflow:
- workflow = workflow_names[0]
- config_hash = get_hash(config)
- path = path.resolve()
- dvc_config_path = path / DVC_CONFIG
- if dvc_config_path.exists():
- # Check if the file was generated using the current config, if not, redo
- with dvc_config_path.open("r", encoding="utf8") as f:
- ref_hash = f.readline().strip().replace("# ", "")
- if ref_hash == config_hash and not force:
- return False # Nothing has changed in project.yml, don't need to update
- dvc_config_path.unlink()
- dvc_commands = []
- config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
-
- # some flags that apply to every command
- flags = []
- if verbose:
- flags.append("--verbose")
- if quiet:
- flags.append("--quiet")
-
- for name in workflows[workflow]:
- command = config_commands[name]
- deps = command.get("deps", [])
- outputs = command.get("outputs", [])
- outputs_no_cache = command.get("outputs_no_cache", [])
- if not deps and not outputs and not outputs_no_cache:
- continue
- # Default to the working dir as the project path since dvc.yaml is auto-generated
- # and we don't want arbitrary paths in there
- project_cmd = ["python", "-m", NAME, "project", "run", name]
- deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
- outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
- outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
-
- dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
- if command.get("no_skip"):
- dvc_cmd.append("--always-changed")
- full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
- dvc_commands.append(join_command(full_cmd))
-
- if not dvc_commands:
- # If we don't check for this, then there will be an error when reading the
- # config, since DVC wouldn't create it.
- msg.fail(
- "No usable commands for DVC found. This can happen if none of your "
- "commands have dependencies or outputs.",
- exits=1,
- )
-
- with working_dir(path):
- for c in dvc_commands:
- dvc_command = "dvc " + c
- run_command(dvc_command)
- with dvc_config_path.open("r+", encoding="utf8") as f:
- content = f.read()
- f.seek(0, 0)
- f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
- return True
-
-
-def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
- """Validate workflows provided in project.yml and check that a given
- workflow can be used to generate a DVC config.
-
- workflows (List[str]): Names of the available workflows.
- workflow (Optional[str]): The name of the workflow to convert.
- """
- if not workflows:
- msg.fail(
- f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
- f"define at least one list of commands.",
- exits=1,
- )
- if workflow is not None and workflow not in workflows:
- msg.fail(
- f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
- f"Available workflows: {', '.join(workflows)}",
- exits=1,
- )
- if not workflow:
- msg.warn(
- f"No workflow specified for DVC pipeline. Using the first workflow "
- f"defined in {PROJECT_FILE}: '{workflows[0]}'"
- )
-
-
-def ensure_dvc(project_dir: Path) -> None:
- """Ensure that the "dvc" command is available and that the current project
- directory is an initialized DVC project.
- """
- try:
- subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
- except Exception:
- msg.fail(
- "To use spaCy projects with DVC (Data Version Control), DVC needs "
- "to be installed and the 'dvc' command needs to be available",
- "You can install the Python package from pip (pip install dvc) or "
- "conda (conda install -c conda-forge dvc). For more details, see the "
- "documentation: https://dvc.org/doc/install",
- exits=1,
- )
- if not (project_dir / ".dvc").exists():
- msg.fail(
- "Project not initialized as a DVC project",
- "To initialize a DVC project, you can run 'dvc init' in the project "
- "directory. For more details, see the documentation: "
- "https://dvc.org/doc/command-reference/init",
- exits=1,
- )
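The regeneration check in update_dvc_config() hinges on the first line of the generated dvc.yaml, which stores a hash of project.yml. A minimal sketch of that check, with an illustrative file name and hash value:

from pathlib import Path

dvc_config_path = Path("dvc.yaml")
config_hash = "d41d8cd98f00b204e9800998ecf8427e"  # hash of the current project.yml

if dvc_config_path.exists():
    # The auto-generated file starts with "# <hash>", so compare against that.
    with dvc_config_path.open("r", encoding="utf8") as f:
        ref_hash = f.readline().strip().replace("# ", "")
    needs_update = ref_hash != config_hash
else:
    needs_update = True
print(needs_update)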
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
deleted file mode 100644
index e9be74df7..000000000
--- a/spacy/cli/project/pull.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from pathlib import Path
-
-from wasabi import msg
-
-from .._util import Arg, load_project_config, logger, project_cli
-from .remote_storage import RemoteStorage, get_command_hash
-from .run import update_lockfile
-
-
-@project_cli.command("pull")
-def project_pull_cli(
- # fmt: off
- remote: str = Arg("default", help="Name or path of remote storage"),
- project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
- # fmt: on
-):
- """Retrieve available precomputed outputs from a remote storage.
- You can alias remotes in your project.yml by mapping them to storage paths.
- A storage can be anything that the smart-open library can upload to, e.g.
- AWS, Google Cloud Storage, SSH, local directories etc.
-
- DOCS: https://spacy.io/api/cli#project-pull
- """
- for url, output_path in project_pull(project_dir, remote):
- if url is not None:
- msg.good(f"Pulled {output_path} from {url}")
-
-
-def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
- # TODO: We don't have tests for this :(. It would take a bit of mockery to
- # set up. I guess see if it breaks first?
- config = load_project_config(project_dir)
- if remote in config.get("remotes", {}):
- remote = config["remotes"][remote]
- storage = RemoteStorage(project_dir, remote)
- commands = list(config.get("commands", []))
- # We use a while loop here because we don't know how the commands
- # will be ordered. A command might need dependencies from one that's later
- # in the list.
- while commands:
- for i, cmd in enumerate(list(commands)):
- logger.debug("CMD: %s.", cmd["name"])
- deps = [project_dir / dep for dep in cmd.get("deps", [])]
- if all(dep.exists() for dep in deps):
- cmd_hash = get_command_hash("", "", deps, cmd["script"])
- for output_path in cmd.get("outputs", []):
- url = storage.pull(output_path, command_hash=cmd_hash)
- logger.debug(
- "URL: %s for %s with command hash %s",
- url,
- output_path,
- cmd_hash,
- )
- yield url, output_path
-
- out_locs = [project_dir / out for out in cmd.get("outputs", [])]
- if all(loc.exists() for loc in out_locs):
- update_lockfile(project_dir, cmd)
- # We remove the command from the list here, and break, so that
- # we iterate over the loop again.
- commands.pop(i)
- break
- else:
- logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
- else:
- # If we didn't break the for loop, break the while loop.
- break
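The while/for-else pattern above resolves commands in dependency order without knowing that order up front: each sweep pulls outputs for any command whose dependencies already exist, and the loop stops once a sweep makes no progress. A toy sketch of the same control flow (command names are illustrative):

commands = ["train", "evaluate", "download"]
ready = {"download"}  # commands whose dependencies already exist locally

while commands:
    for i, cmd in enumerate(list(commands)):
        if cmd in ready:
            print(f"pulled outputs for {cmd}")
            commands.pop(i)
            break  # restart the sweep after making progress
    else:
        break  # a full sweep made no progress, so stop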
diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py
deleted file mode 100644
index a7915e547..000000000
--- a/spacy/cli/project/push.py
+++ /dev/null
@@ -1,69 +0,0 @@
-from pathlib import Path
-
-from wasabi import msg
-
-from .._util import Arg, load_project_config, logger, project_cli
-from .remote_storage import RemoteStorage, get_command_hash, get_content_hash
-
-
-@project_cli.command("push")
-def project_push_cli(
- # fmt: off
- remote: str = Arg("default", help="Name or path of remote storage"),
- project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
- # fmt: on
-):
- """Persist outputs to a remote storage. You can alias remotes in your
- project.yml by mapping them to storage paths. A storage can be anything that
- the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
- local directories etc.
-
- DOCS: https://spacy.io/api/cli#project-push
- """
- for output_path, url in project_push(project_dir, remote):
- if url is None:
- msg.info(f"Skipping {output_path}")
- else:
- msg.good(f"Pushed {output_path} to {url}")
-
-
-def project_push(project_dir: Path, remote: str):
- """Persist outputs to a remote storage. You can alias remotes in your project.yml
- by mapping them to storage paths. A storage can be anything that the smart-open
- library can upload to, e.g. gcs, aws, ssh, local directories etc
- """
- config = load_project_config(project_dir)
- if remote in config.get("remotes", {}):
- remote = config["remotes"][remote]
- storage = RemoteStorage(project_dir, remote)
- for cmd in config.get("commands", []):
- logger.debug("CMD: %s", cmd["name"])
- deps = [project_dir / dep for dep in cmd.get("deps", [])]
- if any(not dep.exists() for dep in deps):
- logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
- continue
- cmd_hash = get_command_hash(
- "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
- )
- logger.debug("CMD_HASH: %s", cmd_hash)
- for output_path in cmd.get("outputs", []):
- output_loc = project_dir / output_path
- if output_loc.exists() and _is_not_empty_dir(output_loc):
- url = storage.push(
- output_path,
- command_hash=cmd_hash,
- content_hash=get_content_hash(output_loc),
- )
- logger.debug(
- "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
- )
- yield output_path, url
-
-
-def _is_not_empty_dir(loc: Path):
- if not loc.is_dir():
- return True
- elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
- return True
- else:
- return False
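Both push and pull accept either a storage URL or a named alias from the remotes section of project.yml. A small sketch of the alias resolution performed before RemoteStorage is constructed (the alias and bucket URL are illustrative):

config = {"remotes": {"default": "s3://my-spacy-bucket/projects"}}
remote = "default"
if remote in config.get("remotes", {}):
    remote = config["remotes"][remote]
print(remote)  # -> s3://my-spacy-bucket/projects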
diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
deleted file mode 100644
index 84235a90d..000000000
--- a/spacy/cli/project/remote_storage.py
+++ /dev/null
@@ -1,212 +0,0 @@
-import hashlib
-import os
-import site
-import tarfile
-import urllib.parse
-from pathlib import Path
-from typing import TYPE_CHECKING, Dict, List, Optional
-
-from wasabi import msg
-
-from ... import about
-from ...errors import Errors
-from ...git_info import GIT_VERSION
-from ...util import ENV_VARS, check_bool_env_var, get_minor_version
-from .._util import (
- download_file,
- ensure_pathy,
- get_checksum,
- get_hash,
- make_tempdir,
- upload_file,
-)
-
-if TYPE_CHECKING:
- from pathy import FluidPath # noqa: F401
-
-
-class RemoteStorage:
- """Push and pull outputs to and from a remote file storage.
-
- Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
- ssh, etc.
- """
-
- def __init__(self, project_root: Path, url: str, *, compression="gz"):
- self.root = project_root
- self.url = ensure_pathy(url)
- self.compression = compression
-
- def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
- """Compress a file or directory within a project and upload it to a remote
- storage. If an object exists at the full URL, nothing is done.
-
- Within the remote storage, files are addressed by their project path
- (url encoded) and two user-supplied hashes, representing their creation
- context and their file contents. If the URL already exists, the data is
- not uploaded. Paths are archived and compressed prior to upload.
- """
- loc = self.root / path
- if not loc.exists():
- raise IOError(f"Cannot push {loc}: does not exist.")
- url = self.make_url(path, command_hash, content_hash)
- if url.exists():
- return url
- tmp: Path
- with make_tempdir() as tmp:
- tar_loc = tmp / self.encode_name(str(path))
- mode_string = f"w:{self.compression}" if self.compression else "w"
- with tarfile.open(tar_loc, mode=mode_string) as tar_file:
- tar_file.add(str(loc), arcname=str(path))
- upload_file(tar_loc, url)
- return url
-
- def pull(
- self,
- path: Path,
- *,
- command_hash: Optional[str] = None,
- content_hash: Optional[str] = None,
- ) -> Optional["FluidPath"]:
- """Retrieve a file from the remote cache. If the file already exists,
- nothing is done.
-
- If the command_hash and/or content_hash are specified, only matching
- results are returned. If no results are available, an error is raised.
- """
- dest = self.root / path
- if dest.exists():
- return None
- url = self.find(path, command_hash=command_hash, content_hash=content_hash)
- if url is None:
- return url
- else:
- # Make sure the destination exists
- if not dest.parent.exists():
- dest.parent.mkdir(parents=True)
- tmp: Path
- with make_tempdir() as tmp:
- tar_loc = tmp / url.parts[-1]
- download_file(url, tar_loc)
- mode_string = f"r:{self.compression}" if self.compression else "r"
- with tarfile.open(tar_loc, mode=mode_string) as tar_file:
- # This requires that the path is added correctly, relative
- # to root. This is how we set things up in push()
-
- # Disallow paths outside the current directory for the tar
- # file (CVE-2007-4559, directory traversal vulnerability)
- def is_within_directory(directory, target):
- abs_directory = os.path.abspath(directory)
- abs_target = os.path.abspath(target)
- prefix = os.path.commonprefix([abs_directory, abs_target])
- return prefix == abs_directory
-
- def safe_extract(tar, path):
- for member in tar.getmembers():
- member_path = os.path.join(path, member.name)
- if not is_within_directory(path, member_path):
- raise ValueError(Errors.E852)
- tar.extractall(path)
-
- safe_extract(tar_file, self.root)
- return url
-
- def find(
- self,
- path: Path,
- *,
- command_hash: Optional[str] = None,
- content_hash: Optional[str] = None,
- ) -> Optional["FluidPath"]:
- """Find the best matching version of a file within the storage,
- or `None` if no match can be found. If both the creation and content hash
- are specified, only exact matches will be returned. Otherwise, the most
- recent matching file is preferred.
- """
- name = self.encode_name(str(path))
- urls = []
- if command_hash is not None and content_hash is not None:
- url = self.url / name / command_hash / content_hash
- urls = [url] if url.exists() else []
- elif command_hash is not None:
- if (self.url / name / command_hash).exists():
- urls = list((self.url / name / command_hash).iterdir())
- else:
- if (self.url / name).exists():
- for sub_dir in (self.url / name).iterdir():
- urls.extend(sub_dir.iterdir())
- if content_hash is not None:
- urls = [url for url in urls if url.parts[-1] == content_hash]
- if len(urls) >= 2:
- try:
- urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
- except Exception:
- msg.warn(
- "Unable to sort remote files by last modified. The file(s) "
- "pulled from the cache may not be the most recent."
- )
- return urls[-1] if urls else None
-
- def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
- """Construct a URL from a subpath, a creation hash and a content hash."""
- return self.url / self.encode_name(str(path)) / command_hash / content_hash
-
- def encode_name(self, name: str) -> str:
- """Encode a subpath into a URL-safe name."""
- return urllib.parse.quote_plus(name)
-
-
-def get_content_hash(loc: Path) -> str:
- return get_checksum(loc)
-
-
-def get_command_hash(
- site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
-) -> str:
- """Create a hash representing the execution of a command. This includes the
- currently installed packages, whatever environment variables have been marked
- as relevant, and the command.
- """
- if check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION):
- spacy_v = GIT_VERSION
- else:
- spacy_v = str(get_minor_version(about.__version__) or "")
- dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
- hashes = [spacy_v, site_hash, env_hash] + dep_checksums
- hashes.extend(cmd)
- creation_bytes = "".join(hashes).encode("utf8")
- return hashlib.md5(creation_bytes).hexdigest()
-
-
-def get_site_hash():
- """Hash the current Python environment's site-packages contents, including
- the name and version of the libraries. The list we're hashing is what
- `pip freeze` would output.
- """
- site_dirs = site.getsitepackages()
- if site.ENABLE_USER_SITE:
- site_dirs.append(site.getusersitepackages())
- packages = set()
- for site_dir in site_dirs:
- site_dir = Path(site_dir)
- for subpath in site_dir.iterdir():
- if subpath.parts[-1].endswith("dist-info"):
- packages.add(subpath.parts[-1].replace(".dist-info", ""))
- package_bytes = "".join(sorted(packages)).encode("utf8")
- return hashlib.md5(package_bytes).hexdigest()
-
-
-def get_env_hash(env: Dict[str, str]) -> str:
- """Construct a hash of the environment variables that will be passed into
- the commands.
-
- Values in the env dict may be references to the current os.environ, using
- the syntax $ENV_VAR to mean os.environ[ENV_VAR]
- """
- env_vars = {}
- for key, value in env.items():
- if value.startswith("$"):
- env_vars[key] = os.environ.get(value[1:], "")
- else:
- env_vars[key] = value
- return get_hash(env_vars)
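To make the $ENV_VAR convention in get_env_hash() concrete, here is a minimal sketch of the substitution step that runs before hashing (the variable names are illustrative):

import os

env = {"GPU_ID": "$CUDA_VISIBLE_DEVICES", "MODE": "prod"}
env_vars = {
    key: os.environ.get(value[1:], "") if value.startswith("$") else value
    for key, value in env.items()
}
print(env_vars)  # values starting with "$" are read from os.environ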
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
deleted file mode 100644
index 43972a202..000000000
--- a/spacy/cli/project/run.py
+++ /dev/null
@@ -1,379 +0,0 @@
-import os.path
-import sys
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
-
-import srsly
-import typer
-from wasabi import msg
-from wasabi.util import locale_escape
-
-from ... import about
-from ...git_info import GIT_VERSION
-from ...util import (
- ENV_VARS,
- SimpleFrozenDict,
- SimpleFrozenList,
- check_bool_env_var,
- is_cwd,
- is_minor_version_match,
- join_command,
- run_command,
- split_command,
- working_dir,
-)
-from .._util import (
- COMMAND,
- PROJECT_FILE,
- PROJECT_LOCK,
- Arg,
- Opt,
- get_checksum,
- get_hash,
- load_project_config,
- parse_config_overrides,
- project_cli,
-)
-
-
-@project_cli.command(
- "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
-)
-def project_run_cli(
- # fmt: off
- ctx: typer.Context, # This is only used to read additional arguments
- subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
- project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
- force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
- dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
- show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
- # fmt: on
-):
- """Run a named command or workflow defined in the project.yml. If a workflow
- name is specified, all commands in the workflow are run, in order. If
- commands define dependencies and/or outputs, they will only be re-run if
- state has changed.
-
- DOCS: https://spacy.io/api/cli#project-run
- """
- if show_help or not subcommand:
- print_run_help(project_dir, subcommand)
- else:
- overrides = parse_config_overrides(ctx.args)
- project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry)
-
-
-def project_run(
- project_dir: Path,
- subcommand: str,
- *,
- overrides: Dict[str, Any] = SimpleFrozenDict(),
- force: bool = False,
- dry: bool = False,
- capture: bool = False,
- skip_requirements_check: bool = False,
-) -> None:
- """Run a named script defined in the project.yml. If the script is part
- of the default pipeline (defined in the "run" section), DVC is used to
- execute the command, so it can determine whether to rerun it. It then
- calls into "exec" to execute it.
-
- project_dir (Path): Path to project directory.
- subcommand (str): Name of command to run.
- overrides (Dict[str, Any]): Optional config overrides.
- force (bool): Force re-running, even if nothing changed.
- dry (bool): Perform a dry run and don't execute commands.
- capture (bool): Whether to capture the output and errors of individual commands.
- If False, the stdout and stderr will not be redirected, and if there's an error,
- sys.exit will be called with the return code. You should use capture=False
- when you want to turn over execution to the command, and capture=True
- when you want to run the command more like a function.
- skip_requirements_check (bool): Whether to skip the requirements check.
- """
- config = load_project_config(project_dir, overrides=overrides)
- commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
- workflows = config.get("workflows", {})
- validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
-
- req_path = project_dir / "requirements.txt"
- if not skip_requirements_check:
- if config.get("check_requirements", True) and os.path.exists(req_path):
- with req_path.open() as requirements_file:
- _check_requirements([req.strip() for req in requirements_file])
-
- if subcommand in workflows:
- msg.info(f"Running workflow '{subcommand}'")
- for cmd in workflows[subcommand]:
- project_run(
- project_dir,
- cmd,
- overrides=overrides,
- force=force,
- dry=dry,
- capture=capture,
- skip_requirements_check=True,
- )
- else:
- cmd = commands[subcommand]
- for dep in cmd.get("deps", []):
- if not (project_dir / dep).exists():
- err = f"Missing dependency specified by command '{subcommand}': {dep}"
- err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
- err_exits = 1 if not dry else None
- msg.fail(err, err_help, exits=err_exits)
- check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
- with working_dir(project_dir) as current_dir:
- msg.divider(subcommand)
- rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
- if not rerun and not force:
- msg.info(f"Skipping '{cmd['name']}': nothing changed")
- else:
- run_commands(cmd["script"], dry=dry, capture=capture)
- if not dry:
- update_lockfile(current_dir, cmd)
-
-
-def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
- """Simulate a CLI help prompt using the info available in the project.yml.
-
- project_dir (Path): The project directory.
- subcommand (Optional[str]): The subcommand or None. If a subcommand is
- provided, the subcommand help is shown. Otherwise, the top-level help
- and a list of available commands is printed.
- """
- config = load_project_config(project_dir)
- config_commands = config.get("commands", [])
- commands = {cmd["name"]: cmd for cmd in config_commands}
- workflows = config.get("workflows", {})
- project_loc = "" if is_cwd(project_dir) else project_dir
- if subcommand:
- validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
- print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
- if subcommand in commands:
- help_text = commands[subcommand].get("help")
- if help_text:
- print(f"\n{help_text}\n")
- elif subcommand in workflows:
- steps = workflows[subcommand]
- print(f"\nWorkflow consisting of {len(steps)} commands:")
- steps_data = [
- (f"{i + 1}. {step}", commands[step].get("help", ""))
- for i, step in enumerate(steps)
- ]
- msg.table(steps_data)
- help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
- print(f"For command details, run: {help_cmd}")
- else:
- print("")
- title = config.get("title")
- if title:
- print(f"{locale_escape(title)}\n")
- if config_commands:
- print(f"Available commands in {PROJECT_FILE}")
- print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
- msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
- if workflows:
- print(f"Available workflows in {PROJECT_FILE}")
- print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
- msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
-
-
-def run_commands(
- commands: Iterable[str] = SimpleFrozenList(),
- silent: bool = False,
- dry: bool = False,
- capture: bool = False,
-) -> None:
- """Run a sequence of commands in a subprocess, in order.
-
- commands (List[str]): The string commands.
- silent (bool): Don't print the commands.
- dry (bool): Perform a dry run and don't execute anything.
- capture (bool): Whether to capture the output and errors of individual commands.
- If False, the stdout and stderr will not be redirected, and if there's an error,
- sys.exit will be called with the return code. You should use capture=False
- when you want to turn over execution to the command, and capture=True
- when you want to run the command more like a function.
- """
- for c in commands:
- command = split_command(c)
- # Not sure if this is needed or a good idea. Motivation: users may often
- # use commands in their config that reference "python" and we want to
- # make sure that it's always executing the same Python that spaCy is
- # executed with and the pip in the same env, not some other Python/pip.
- # Also ensures cross-compatibility if user 1 writes "python3" (because
- # that's how it's set up on their system), and user 2 without the
- # shortcut tries to re-run the command.
- if len(command) and command[0] in ("python", "python3"):
- command[0] = sys.executable
- elif len(command) and command[0] in ("pip", "pip3"):
- command = [sys.executable, "-m", "pip", *command[1:]]
- if not silent:
- print(f"Running command: {join_command(command)}")
- if not dry:
- run_command(command, capture=capture)
-
-
-def validate_subcommand(
- commands: Sequence[str], workflows: Sequence[str], subcommand: str
-) -> None:
- """Check that a subcommand is valid and defined. Raises an error otherwise.
-
- commands (Sequence[str]): The available commands.
- workflows (Sequence[str]): The available workflows.
- subcommand (str): The subcommand.
- """
- if not commands and not workflows:
- msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
- if subcommand not in commands and subcommand not in workflows:
- help_msg = []
- if subcommand in ["assets", "asset"]:
- help_msg.append("Did you mean to run: python -m spacy project assets?")
- if commands:
- help_msg.append(f"Available commands: {', '.join(commands)}")
- if workflows:
- help_msg.append(f"Available workflows: {', '.join(workflows)}")
- msg.fail(
- f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
- ". ".join(help_msg),
- exits=1,
- )
-
-
-def check_rerun(
- project_dir: Path,
- command: Dict[str, Any],
- *,
- check_spacy_version: bool = True,
- check_spacy_commit: bool = False,
-) -> bool:
- """Check if a command should be rerun because its settings or inputs/outputs
- changed.
-
- project_dir (Path): The current project directory.
- command (Dict[str, Any]): The command, as defined in the project.yml.
- check_spacy_version (bool): Whether to re-run if the spaCy minor version changed.
- check_spacy_commit (bool): Whether to re-run if the spaCy commit hash changed.
- RETURNS (bool): Whether to re-run the command.
- """
- # Always rerun if no-skip is set
- if command.get("no_skip", False):
- return True
- lock_path = project_dir / PROJECT_LOCK
- if not lock_path.exists(): # We don't have a lockfile, run command
- return True
- data = srsly.read_yaml(lock_path)
- if command["name"] not in data: # We don't have info about this command
- return True
- entry = data[command["name"]]
- # Always run commands with no outputs (otherwise they'd always be skipped)
- if not entry.get("outs", []):
- return True
- # Always rerun if spaCy version or commit hash changed
- spacy_v = entry.get("spacy_version")
- commit = entry.get("spacy_git_version")
- if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
- info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
- msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
- return True
- if check_spacy_commit and commit != GIT_VERSION:
- info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
- msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
- return True
- # If the entry in the lockfile matches the lockfile entry that would be
- # generated from the current command, we don't rerun because it means that
- # all inputs/outputs, hashes and scripts are the same and nothing changed
- lock_entry = get_lock_entry(project_dir, command)
- exclude = ["spacy_version", "spacy_git_version"]
- return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
-
-
-def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
- """Update the lockfile after running a command. Will create a lockfile if
- it doesn't yet exist and will add an entry for the current command, its
- script and dependencies/outputs.
-
- project_dir (Path): The current project directory.
- command (Dict[str, Any]): The command, as defined in the project.yml.
- """
- lock_path = project_dir / PROJECT_LOCK
- if not lock_path.exists():
- srsly.write_yaml(lock_path, {})
- data = {}
- else:
- data = srsly.read_yaml(lock_path)
- data[command["name"]] = get_lock_entry(project_dir, command)
- srsly.write_yaml(lock_path, data)
-
-
-def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
- """Get a lockfile entry for a given command. An entry includes the command,
- the script (command steps) and a list of dependencies and outputs with
- their paths and file hashes, if available. The format is based on the
- dvc.lock files, to keep things consistent.
-
- project_dir (Path): The current project directory.
- command (Dict[str, Any]): The command, as defined in the project.yml.
- RETURNS (Dict[str, Any]): The lockfile entry.
- """
- deps = get_fileinfo(project_dir, command.get("deps", []))
- outs = get_fileinfo(project_dir, command.get("outputs", []))
- outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
- return {
- "cmd": f"{COMMAND} run {command['name']}",
- "script": command["script"],
- "deps": deps,
- "outs": [*outs, *outs_nc],
- "spacy_version": about.__version__,
- "spacy_git_version": GIT_VERSION,
- }
-
-
-def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
- """Generate the file information for a list of paths (dependencies, outputs).
- Includes the file path and the file's checksum.
-
- project_dir (Path): The current project directory.
- paths (List[str]): The file paths.
- RETURNS (List[Dict[str, Optional[str]]]): The file information for each path.
- """
- data = []
- for path in paths:
- file_path = project_dir / path
- md5 = get_checksum(file_path) if file_path.exists() else None
- data.append({"path": path, "md5": md5})
- return data
-
-
-def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
- """Checks whether requirements are installed and free of version conflicts.
- requirements (List[str]): List of requirements.
- RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
- exist.
- """
- import pkg_resources
-
- failed_pkgs_msgs: List[str] = []
- conflicting_pkgs_msgs: List[str] = []
-
- for req in requirements:
- try:
- pkg_resources.require(req)
- except pkg_resources.DistributionNotFound as dnf:
- failed_pkgs_msgs.append(dnf.report())
- except pkg_resources.VersionConflict as vc:
- conflicting_pkgs_msgs.append(vc.report())
- except Exception:
- msg.warn(
- f"Unable to check requirement: {req} "
- "Checks are currently limited to requirement specifiers "
- "(PEP 508)"
- )
-
- if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
- msg.warn(
- title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
- "correctly and you installed all requirements specified in your project's requirements.txt: "
- )
- for pkg_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
- msg.text(pkg_msg)
-
- return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
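Putting the lockfile pieces together: each command gets an entry shaped like the sketch below, and check_rerun() compares the stored entry against a freshly computed one while ignoring the spaCy version fields, so only the script, dependencies and outputs decide whether a command is re-run (paths and hashes are illustrative):

lock_entry = {
    "cmd": "python -m spacy run train",
    "script": ["python scripts/train.py"],
    "deps": [{"path": "corpus/train.spacy", "md5": "aabbccddeeff"}],
    "outs": [{"path": "training/model-best", "md5": None}],
    "spacy_version": "3.6.1",
    "spacy_git_version": "abc1234",
}
exclude = {"spacy_version", "spacy_git_version"}
comparable = {k: v for k, v in lock_entry.items() if k not in exclude}
print(sorted(comparable))  # only these keys feed the rerun hash comparison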
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 8bdabd39c..c72e13b26 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -47,7 +47,8 @@ def train_cli(
DOCS: https://spacy.io/api/cli#train
"""
- util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+ if verbose:
+ util.logger.setLevel(logging.DEBUG)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 694fb732f..b005eef40 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -26,6 +26,9 @@ batch_size = 1000
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
+[nlp.vectors]
+@vectors = "spacy.Vectors.v1"
+
# The pipeline components and their models
[components]
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 86869e3b8..2ab41ccc2 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -1,4 +1,3 @@
-import itertools
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union
@@ -218,7 +217,7 @@ class SpanRenderer:
+ (self.offset_step * (len(entities) - 1))
)
markup += self.span_template.format(
- text=token["text"],
+ text=escape_html(token["text"]),
span_slices=slices,
span_starts=starts,
total_height=total_height,
@@ -314,6 +313,8 @@ class DependencyRenderer:
self.lang = settings.get("lang", DEFAULT_LANG)
render_id = f"{id_prefix}-{i}"
svg = self.render_svg(render_id, p["words"], p["arcs"])
+ if p.get("title"):
+ svg = TPL_TITLE.format(title=p.get("title")) + svg
rendered.append(svg)
if page:
content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])
@@ -566,7 +567,7 @@ class EntityRenderer:
for i, fragment in enumerate(fragments):
markup += escape_html(fragment)
if len(fragments) > 1 and i != len(fragments) - 1:
- markup += ""
+ markup += " "
if self.ents is None or label.upper() in self.ents:
color = self.colors.get(label.upper(), self.default_color)
ent_settings = {
@@ -584,7 +585,7 @@ class EntityRenderer:
for i, fragment in enumerate(fragments):
markup += escape_html(fragment)
if len(fragments) > 1 and i != len(fragments) - 1:
- markup += ""
+ markup += " "
markup = TPL_ENTS.format(content=markup, dir=self.direction)
if title:
markup = TPL_TITLE.format(title=title) + markup
diff --git a/spacy/errors.py b/spacy/errors.py
index db1a886aa..dac07f804 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -219,6 +219,7 @@ class Warnings(metaclass=ErrorsWithCodes):
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
"key attribute for vectors, configure it through Vectors(attr=) or "
"'spacy init vectors --attr'")
+ W126 = ("These keys are unsupported: {unsupported}")
class Errors(metaclass=ErrorsWithCodes):
@@ -553,12 +554,12 @@ class Errors(metaclass=ErrorsWithCodes):
"during training, make sure to include it in 'annotating components'")
# New errors added in v3.x
+ E849 = ("The vocab only supports {method} for vectors of type "
+ "spacy.vectors.Vectors, not {vectors_type}.")
E850 = ("The PretrainVectors objective currently only supports default or "
"floret vectors, not {mode} vectors.")
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
"but found value of '{val}'.")
- E852 = ("The tar file pulled from the remote attempted an unsafe path "
- "traversal.")
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
"not permitted in factory names.")
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
@@ -981,6 +982,8 @@ class Errors(metaclass=ErrorsWithCodes):
" 'min_length': {min_length}, 'max_length': {max_length}")
E1054 = ("The text, including whitespace, must match between reference and "
"predicted docs when training {component}.")
+ E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
+ "but only callbacks with one or three parameters are supported")
# Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
index 9fc4c4e9d..80fcbc459 100644
--- a/spacy/kb/candidate.pxd
+++ b/spacy/kb/candidate.pxd
@@ -4,7 +4,8 @@ from ..typedefs cimport hash_t
from .kb cimport KnowledgeBase
-# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
+# Object used by the Entity Linker that summarizes one entity-alias candidate
+# combination.
cdef class Candidate:
cdef readonly KnowledgeBase kb
cdef hash_t entity_hash
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index 4cd734f43..4369676e2 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True
from typing import Iterable
@@ -8,15 +8,24 @@ from ..tokens import Span
cdef class Candidate:
- """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
- to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
- algorithm which will disambiguate the various candidates to the correct one.
+ """A `Candidate` object refers to a textual mention (`alias`) that may or
+ may not be resolved to a specific `entity` from a Knowledge Base. This
+ will be used as input for the entity linking algorithm which will
+ disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate-init
"""
- def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
+ def __init__(
+ self,
+ KnowledgeBase kb,
+ entity_hash,
+ entity_freq,
+ entity_vector,
+ alias_hash,
+ prior_prob
+ ):
self.kb = kb
self.entity_hash = entity_hash
self.entity_freq = entity_freq
@@ -59,7 +68,8 @@ cdef class Candidate:
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
"""
- Return candidate entities for a given mention and fetching appropriate entries from the index.
+ Return candidate entities for a given mention by fetching appropriate
+ entries from the index.
kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
@@ -67,9 +77,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
return kb.get_candidates(mention)
-def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+def get_candidates_batch(
+ kb: KnowledgeBase, mentions: Iterable[Span]
+) -> Iterable[Iterable[Candidate]]:
"""
+ Return candidate entities for the given mentions by fetching appropriate entries
+ from the index.
+ from the index.
kb (KnowledgeBase): Knowledge base to query.
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index a88e18e1f..c7db34e16 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True
from pathlib import Path
from typing import Iterable, Tuple, Union
@@ -12,8 +12,9 @@ from .candidate import Candidate
cdef class KnowledgeBase:
- """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
- to support entity linking of named entities to real-world concepts.
+ """A `KnowledgeBase` instance stores unique identifiers for entities and
+ their textual aliases, to support entity linking of named entities to
+ real-world concepts.
This is an abstract class and requires its operations to be implemented.
DOCS: https://spacy.io/api/kb
@@ -31,10 +32,13 @@ cdef class KnowledgeBase:
self.entity_vector_length = entity_vector_length
self.mem = Pool()
- def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+ def get_candidates_batch(
+ self, mentions: Iterable[Span]
+ ) -> Iterable[Iterable[Candidate]]:
"""
- Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
- and the prior probability of that alias resolving to that entity.
+ Return candidate entities for specified texts. Each candidate defines
+ the entity, the original alias, and the prior probability of that
+ alias resolving to that entity.
If no candidate is found for a given text, an empty list is returned.
mentions (Iterable[Span]): Mentions for which to get candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
@@ -43,14 +47,17 @@ cdef class KnowledgeBase:
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
"""
- Return candidate entities for specified text. Each candidate defines the entity, the original alias,
+ Return candidate entities for specified text. Each candidate defines
+ the entity, the original alias,
and the prior probability of that alias resolving to that entity.
If no candidate is found for a given text, an empty list is returned.
mention (Span): Mention for which to get candidates.
RETURNS (Iterable[Candidate]): Identified candidates.
"""
raise NotImplementedError(
- Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
+ Errors.E1045.format(
+ parent="KnowledgeBase", method="get_candidates", name=self.__name__
+ )
)
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
@@ -68,7 +75,9 @@ cdef class KnowledgeBase:
RETURNS (Iterable[float]): Vector for specified entity.
"""
raise NotImplementedError(
- Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
+ Errors.E1045.format(
+ parent="KnowledgeBase", method="get_vector", name=self.__name__
+ )
)
def to_bytes(self, **kwargs) -> bytes:
@@ -76,7 +85,9 @@ cdef class KnowledgeBase:
RETURNS (bytes): Current state as binary string.
"""
raise NotImplementedError(
- Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
+ Errors.E1045.format(
+ parent="KnowledgeBase", method="to_bytes", name=self.__name__
+ )
)
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
@@ -85,25 +96,35 @@ cdef class KnowledgeBase:
exclude (Tuple[str]): Properties to exclude when restoring KB.
"""
raise NotImplementedError(
- Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
+ Errors.E1045.format(
+ parent="KnowledgeBase", method="from_bytes", name=self.__name__
+ )
)
- def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+ def to_disk(
+ self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+ ) -> None:
"""
Write KnowledgeBase content to disk.
path (Union[str, Path]): Target file path.
exclude (Iterable[str]): List of components to exclude.
"""
raise NotImplementedError(
- Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
+ Errors.E1045.format(
+ parent="KnowledgeBase", method="to_disk", name=self.__name__
+ )
)
- def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+ def from_disk(
+ self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+ ) -> None:
"""
Load KnowledgeBase content from disk.
path (Union[str, Path]): Target file path.
exclude (Iterable[str]): List of components to exclude.
"""
raise NotImplementedError(
- Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
+ Errors.E1045.format(
+ parent="KnowledgeBase", method="from_disk", name=self.__name__
+ )
)
diff --git a/spacy/kb/kb_in_memory.pxd b/spacy/kb/kb_in_memory.pxd
index 08ec6b2a3..e0e33301a 100644
--- a/spacy/kb/kb_in_memory.pxd
+++ b/spacy/kb/kb_in_memory.pxd
@@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
# optional data, we can let users configure a DB as the backend for this.
cdef object _features_table
-
cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
"""Add an entity vector to the vectors table."""
cdef int64_t new_index = self._vectors_table.size()
self._vectors_table.push_back(entity_vector)
return new_index
-
- cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
- int32_t vector_index, int feats_row) nogil:
+ cdef inline int64_t c_add_entity(
+ self,
+ hash_t entity_hash,
+ float freq,
+ int32_t vector_index,
+ int feats_row
+ ) nogil:
"""Add an entry to the vector of entries.
- After calling this method, make sure to update also the _entry_index using the return value"""
+ After calling this method, make sure to also update the _entry_index
+ using the return value."""
# This is what we'll map the entity hash key to. It's where the entry will sit
# in the vector of entries, so we can get it later.
cdef int64_t new_index = self._entries.size()
- # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
+ # Avoid struct initializer to enable nogil, cf.
+ # https://github.com/cython/cython/issues/1642
cdef KBEntryC entry
entry.entity_hash = entity_hash
entry.vector_index = vector_index
@@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
self._entries.push_back(entry)
return new_index
- cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
- """Connect a mention to a list of potential entities with their prior probabilities .
- After calling this method, make sure to update also the _alias_index using the return value"""
- # This is what we'll map the alias hash key to. It's where the alias will be defined
- # in the vector of aliases.
+ cdef inline int64_t c_add_aliases(
+ self,
+ hash_t alias_hash,
+ vector[int64_t] entry_indices,
+ vector[float] probs
+ ) nogil:
+ """Connect a mention to a list of potential entities with their prior
+ probabilities. After calling this method, make sure to also update the
+ _alias_index using the return value."""
+ # This is what we'll map the alias hash key to. It's where the alias will be
+ # defined in the vector of aliases.
cdef int64_t new_index = self._aliases_table.size()
# Avoid struct initializer to enable nogil
@@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
"""
- Initializing the vectors and making sure the first element of each vector is a dummy,
- because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
+ Initialize the vectors and make sure the first element of each vector is a
+ dummy, because the PreshMap maps pointing to indices in these vectors cannot
+ contain 0 as a value.
cf. https://github.com/explosion/preshed/issues/17
"""
cdef int32_t dummy_value = 0
@@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
cdef class Writer:
cdef FILE* _fp
- cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
+ cdef int write_header(
+ self, int64_t nr_entries, int64_t entity_vector_length
+ ) except -1
cdef int write_vector_element(self, float element) except -1
- cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
+ cdef int write_entry(
+ self, hash_t entry_hash, float entry_freq, int32_t vector_index
+ ) except -1
cdef int write_alias_length(self, int64_t alias_length) except -1
- cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
+ cdef int write_alias_header(
+ self, hash_t alias_hash, int64_t candidate_length
+ ) except -1
cdef int write_alias(self, int64_t entry_index, float prob) except -1
cdef int _write(self, void* value, size_t size) except -1
@@ -143,12 +161,18 @@ cdef class Writer:
cdef class Reader:
cdef FILE* _fp
- cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
+ cdef int read_header(
+ self, int64_t* nr_entries, int64_t* entity_vector_length
+ ) except -1
cdef int read_vector_element(self, float* element) except -1
- cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
+ cdef int read_entry(
+ self, hash_t* entity_hash, float* freq, int32_t* vector_index
+ ) except -1
cdef int read_alias_length(self, int64_t* alias_length) except -1
- cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
+ cdef int read_alias_header(
+ self, hash_t* alias_hash, int64_t* candidate_length
+ ) except -1
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
cdef int _read(self, void* value, size_t size) except -1
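
The reformatted Writer/Reader declarations above spell out the knowledge base's on-disk record layout: an int64 pair for the header, a uint64/float32/int32 triple per entry, and int64/float32 pairs per alias candidate. Purely as a non-normative illustration (field order is taken from the signatures; the helpers below are not spaCy API, and byte order is whatever the writing platform used), the header and entry records could be packed like this:

import struct

def pack_header(nr_entries: int, entity_vector_length: int) -> bytes:
    # mirrors Writer.write_header(): two int64 values
    return struct.pack("=qq", nr_entries, entity_vector_length)

def pack_entry(entry_hash: int, entry_freq: float, vector_index: int) -> bytes:
    # mirrors Writer.write_entry(): hash_t (uint64), float32, int32
    return struct.pack("=Qfi", entry_hash, entry_freq, vector_index)
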
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index e991f7720..2b21f246a 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -1,5 +1,5 @@
-# cython: infer_types=True, profile=True
-from typing import Any, Callable, Dict, Iterable, Union
+# cython: infer_types=True
+from typing import Any, Callable, Dict, Iterable
import srsly
@@ -27,8 +27,9 @@ from .candidate import Candidate as Candidate
cdef class InMemoryLookupKB(KnowledgeBase):
- """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
- to support entity linking of named entities to real-world concepts.
+ """An `InMemoryLookupKB` instance stores unique identifiers for entities
+ and their textual aliases, to support entity linking of named entities to
+ real-world concepts.
DOCS: https://spacy.io/api/inmemorylookupkb
"""
@@ -71,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
def add_entity(self, str entity, float freq, vector[float] entity_vector):
"""
- Add an entity to the KB, optionally specifying its log probability based on corpus frequency
+ Add an entity to the KB, optionally specifying its log probability
+ based on corpus frequency.
Return the hash of the entity ID/name at the end.
"""
cdef hash_t entity_hash = self.vocab.strings.add(entity)
@@ -83,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):
# Raise an error if the provided entity vector is not of the correct length
if len(entity_vector) != self.entity_vector_length:
- raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+ raise ValueError(
+ Errors.E141.format(
+ found=len(entity_vector), required=self.entity_vector_length
+ )
+ )
vector_index = self.c_add_vector(entity_vector=entity_vector)
- new_index = self.c_add_entity(entity_hash=entity_hash,
- freq=freq,
- vector_index=vector_index,
- feats_row=-1) # Features table currently not implemented
+ new_index = self.c_add_entity(
+ entity_hash=entity_hash,
+ freq=freq,
+ vector_index=vector_index,
+ feats_row=-1
+ ) # Features table currently not implemented
self._entry_index[entity_hash] = new_index
return entity_hash
@@ -115,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
else:
entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length:
- raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+ raise ValueError(
+ Errors.E141.format(
+ found=len(entity_vector),
+ required=self.entity_vector_length
+ )
+ )
entry.entity_hash = entity_hash
entry.freq = freq_list[i]
@@ -149,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
previous_alias_nr = self.get_size_aliases()
# Throw an error if the length of entities and probabilities are not the same
if not len(entities) == len(probabilities):
- raise ValueError(Errors.E132.format(alias=alias,
- entities_length=len(entities),
- probabilities_length=len(probabilities)))
+ raise ValueError(
+ Errors.E132.format(
+ alias=alias,
+ entities_length=len(entities),
+ probabilities_length=len(probabilities))
+ )
- # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
+ # Throw an error if the probabilities sum up to more than 1 (allow for
+ # some rounding errors)
prob_sum = sum(probabilities)
if prob_sum > 1.00001:
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
@@ -170,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):
for entity, prob in zip(entities, probabilities):
entity_hash = self.vocab.strings[entity]
- if not entity_hash in self._entry_index:
+ if entity_hash not in self._entry_index:
raise ValueError(Errors.E134.format(entity=entity))
entry_index = self._entry_index.get(entity_hash)
entry_indices.push_back(int(entry_index))
probs.push_back(float(prob))
- new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
+ new_index = self.c_add_aliases(
+ alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
+ )
self._alias_index[alias_hash] = new_index
if previous_alias_nr + 1 != self.get_size_aliases():
raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash
- def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
+ def append_alias(
+ self, str alias, str entity, float prior_prob, ignore_warnings=False
+ ):
"""
- For an alias already existing in the KB, extend its potential entities with one more.
+ For an alias already existing in the KB, extend its potential entities
+ with one more.
Throw a warning if either the alias or the entity is unknown,
or when the combination is already previously recorded.
Throw an error if this entity+prior prob would exceed the sum of 1.
- For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
+ For efficiency, it's best to use the method `add_alias` as much as
+ possible instead of this one.
"""
# Check if the alias exists in the KB
cdef hash_t alias_hash = self.vocab.strings[alias]
- if not alias_hash in self._alias_index:
+ if alias_hash not in self._alias_index:
raise ValueError(Errors.E176.format(alias=alias))
# Check if the entity exists in the KB
cdef hash_t entity_hash = self.vocab.strings[entity]
- if not entity_hash in self._entry_index:
+ if entity_hash not in self._entry_index:
raise ValueError(Errors.E134.format(entity=entity))
entry_index = self._entry_index.get(entity_hash)
- # Throw an error if the prior probabilities (including the new one) sum up to more than 1
+ # Throw an error if the prior probabilities (including the new one)
+ # sum up to more than 1
alias_index = self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index]
current_sum = sum([p for p in alias_entry.probs])
@@ -236,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
"""
- Return candidate entities for an alias. Each candidate defines the entity, the original alias,
- and the prior probability of that alias resolving to that entity.
+ Return candidate entities for an alias. Each candidate defines the
+ entity, the original alias, and the prior probability of that alias
+ resolving to that entity.
        If the alias is not known in the KB, an empty list is returned.
"""
cdef hash_t alias_hash = self.vocab.strings[alias]
- if not alias_hash in self._alias_index:
+ if alias_hash not in self._alias_index:
return []
alias_index = self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index]
@@ -249,10 +274,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
return [Candidate(kb=self,
entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq,
- entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
+ entity_vector=self._vectors_table[
+ self._entries[entry_index].vector_index
+ ],
alias_hash=alias_hash,
prior_prob=prior_prob)
- for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+ for (entry_index, prior_prob) in zip(
+ alias_entry.entry_indices, alias_entry.probs
+ )
if entry_index != 0]
def get_vector(self, str entity):
@@ -266,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
return self._vectors_table[self._entries[entry_index].vector_index]
def get_prior_prob(self, str entity, str alias):
- """ Return the prior probability of a given alias being linked to a given entity,
- or return 0.0 when this combination is not known in the knowledge base"""
+ """ Return the prior probability of a given alias being linked to a
+ given entity, or return 0.0 when this combination is not known in the
+ knowledge base."""
cdef hash_t alias_hash = self.vocab.strings[alias]
cdef hash_t entity_hash = self.vocab.strings[entity]
@@ -278,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
entry_index = self._entry_index[entity_hash]
alias_entry = self._aliases_table[alias_index]
- for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
+ for (entry_index, prior_prob) in zip(
+ alias_entry.entry_indices, alias_entry.probs
+ ):
if self._entries[entry_index].entity_hash == entity_hash:
return prior_prob
@@ -288,13 +320,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
"""Serialize the current state to a binary string.
"""
def serialize_header():
- header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
+ header = (
+ self.get_size_entities(),
+ self.get_size_aliases(),
+ self.entity_vector_length
+ )
return srsly.json_dumps(header)
def serialize_entries():
i = 1
tuples = []
- for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+ for entry_hash, entry_index in sorted(
+ self._entry_index.items(), key=lambda x: x[1]
+ ):
entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash
assert entry_index == i
@@ -307,7 +345,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
headers = []
indices_lists = []
probs_lists = []
- for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+ for alias_hash, alias_index in sorted(
+ self._alias_index.items(), key=lambda x: x[1]
+ ):
alias = self._aliases_table[alias_index]
assert alias_index == i
candidate_length = len(alias.entry_indices)
@@ -365,7 +405,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
indices = srsly.json_loads(all_data[1])
probs = srsly.json_loads(all_data[2])
for header, indices, probs in zip(headers, indices, probs):
- alias_hash, candidate_length = header
+ alias_hash, _candidate_length = header
alias.entry_indices = indices
alias.probs = probs
self._aliases_table[i] = alias
@@ -414,10 +454,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
writer.write_vector_element(element)
i = i+1
- # dumping the entry records in the order in which they are in the _entries vector.
- # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+ # dumping the entry records in the order in which they are in the
+ # _entries vector.
+ # index 0 is a dummy object not stored in the _entry_index and can
+ # be ignored.
i = 1
- for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+ for entry_hash, entry_index in sorted(
+ self._entry_index.items(), key=lambda x: x[1]
+ ):
entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash
assert entry_index == i
@@ -429,7 +473,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
# dumping the aliases in the order in which they are in the _alias_index vector.
# index 0 is a dummy object not stored in the _aliases_table and can be ignored.
i = 1
- for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+ for alias_hash, alias_index in sorted(
+ self._alias_index.items(), key=lambda x: x[1]
+ ):
alias = self._aliases_table[alias_index]
assert alias_index == i
@@ -535,7 +581,8 @@ cdef class Writer:
def __init__(self, path):
assert isinstance(path, Path)
content = bytes(path)
- cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
+ cdef bytes bytes_loc = content.encode('utf8') \
+ if type(content) == str else content
self._fp = fopen(bytes_loc, 'wb')
if not self._fp:
raise IOError(Errors.E146.format(path=path))
@@ -545,14 +592,18 @@ cdef class Writer:
cdef size_t status = fclose(self._fp)
assert status == 0
- cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
+ cdef int write_header(
+ self, int64_t nr_entries, int64_t entity_vector_length
+ ) except -1:
self._write(&nr_entries, sizeof(nr_entries))
self._write(&entity_vector_length, sizeof(entity_vector_length))
cdef int write_vector_element(self, float element) except -1:
self._write(&element, sizeof(element))
- cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
+ cdef int write_entry(
+ self, hash_t entry_hash, float entry_freq, int32_t vector_index
+ ) except -1:
self._write(&entry_hash, sizeof(entry_hash))
self._write(&entry_freq, sizeof(entry_freq))
self._write(&vector_index, sizeof(vector_index))
@@ -561,7 +612,9 @@ cdef class Writer:
cdef int write_alias_length(self, int64_t alias_length) except -1:
self._write(&alias_length, sizeof(alias_length))
- cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
+ cdef int write_alias_header(
+ self, hash_t alias_hash, int64_t candidate_length
+ ) except -1:
self._write(&alias_hash, sizeof(alias_hash))
self._write(&candidate_length, sizeof(candidate_length))
@@ -577,16 +630,19 @@ cdef class Writer:
cdef class Reader:
def __init__(self, path):
content = bytes(path)
- cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
+ cdef bytes bytes_loc = content.encode('utf8') \
+ if type(content) == str else content
self._fp = fopen(bytes_loc, 'rb')
if not self._fp:
PyErr_SetFromErrno(IOError)
- status = fseek(self._fp, 0, 0) # this can be 0 if there is no header
+ fseek(self._fp, 0, 0) # this can be 0 if there is no header
def __dealloc__(self):
fclose(self._fp)
- cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
+ cdef int read_header(
+ self, int64_t* nr_entries, int64_t* entity_vector_length
+ ) except -1:
status = self._read(nr_entries, sizeof(int64_t))
if status < 1:
if feof(self._fp):
@@ -606,7 +662,9 @@ cdef class Reader:
return 0 # end of file
raise IOError(Errors.E145.format(param="vector element"))
- cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
+ cdef int read_entry(
+ self, hash_t* entity_hash, float* freq, int32_t* vector_index
+ ) except -1:
status = self._read(entity_hash, sizeof(hash_t))
if status < 1:
if feof(self._fp):
@@ -637,7 +695,9 @@ cdef class Reader:
return 0 # end of file
raise IOError(Errors.E145.format(param="alias length"))
- cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
+ cdef int read_alias_header(
+ self, hash_t* alias_hash, int64_t* candidate_length
+ ) except -1:
status = self._read(alias_hash, sizeof(hash_t))
if status < 1:
if feof(self._fp):
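
Most hunks in this file only re-wrap long calls, so as a sanity check here is a minimal usage sketch of the public methods they touch; the entity ID, frequency, vectors and probabilities are made up for illustration.

import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)

# add_entity() returns the hash of the entity ID
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
# probabilities for one alias must not sum to more than 1 (E133 otherwise)
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.8])

candidates = kb.get_alias_candidates("Douglas")
print([c.entity_ for c in candidates])
print(kb.get_prior_prob(entity="Q42", alias="Douglas"))
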
diff --git a/spacy/lang/es/lemmatizer.py b/spacy/lang/es/lemmatizer.py
index 44f968347..ee5d38e84 100644
--- a/spacy/lang/es/lemmatizer.py
+++ b/spacy/lang/es/lemmatizer.py
@@ -163,7 +163,7 @@ class SpanishLemmatizer(Lemmatizer):
for old, new in self.lookups.get_table("lemma_rules").get("det", []):
if word == old:
return [new]
- # If none of the specfic rules apply, search in the common rules for
+ # If none of the specific rules apply, search in the common rules for
# determiners and pronouns that follow a unique pattern for
# lemmatization. If the word is in the list, return the corresponding
# lemma.
@@ -291,7 +291,7 @@ class SpanishLemmatizer(Lemmatizer):
for old, new in self.lookups.get_table("lemma_rules").get("pron", []):
if word == old:
return [new]
- # If none of the specfic rules apply, search in the common rules for
+ # If none of the specific rules apply, search in the common rules for
# determiners and pronouns that follow a unique pattern for
# lemmatization. If the word is in the list, return the corresponding
# lemma.
diff --git a/spacy/lang/grc/punctuation.py b/spacy/lang/grc/punctuation.py
index 8e9fc8bf2..59037617d 100644
--- a/spacy/lang/grc/punctuation.py
+++ b/spacy/lang/grc/punctuation.py
@@ -15,6 +15,7 @@ _prefixes = (
[
"†",
"⸏",
+ "〈",
]
+ LIST_PUNCT
+ LIST_ELLIPSES
@@ -31,6 +32,7 @@ _suffixes = (
+ [
"†",
"⸎",
+ "〉",
r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
]
)
diff --git a/spacy/lang/tr/examples.py b/spacy/lang/tr/examples.py
index dfb324a4e..c912c950d 100644
--- a/spacy/lang/tr/examples.py
+++ b/spacy/lang/tr/examples.py
@@ -15,4 +15,7 @@ sentences = [
"Türkiye'nin başkenti neresi?",
"Bakanlar Kurulu 180 günlük eylem planını açıkladı.",
"Merkez Bankası, beklentiler doğrultusunda faizlerde değişikliğe gitmedi.",
+ "Cemal Sureya kimdir?",
+ "Bunlari Biliyor muydunuz?",
+ "Altinoluk Turkiye haritasinin neresinde yer alir?",
]
diff --git a/spacy/language.py b/spacy/language.py
index fd616483b..26152b90a 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,4 +1,5 @@
import functools
+import inspect
import itertools
import multiprocessing as mp
import random
@@ -64,6 +65,7 @@ from .util import (
registry,
warn_if_jupyter_cupy,
)
+from .vectors import BaseVectors
from .vocab import Vocab, create_vocab
PipeCallable = Callable[[Doc], Doc]
@@ -157,6 +159,7 @@ class Language:
max_length: int = 10**6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
+ create_vectors: Optional[Callable[["Vocab"], BaseVectors]] = None,
batch_size: int = 1000,
**kwargs,
) -> None:
@@ -197,6 +200,10 @@ class Language:
if vocab is True:
vectors_name = meta.get("vectors", {}).get("name")
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
+ if not create_vectors:
+ vectors_cfg = {"vectors": self._config["nlp"]["vectors"]}
+ create_vectors = registry.resolve(vectors_cfg)["vectors"]
+ vocab.vectors = create_vectors(vocab)
else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@@ -1764,6 +1771,10 @@ class Language:
).merge(config)
if "nlp" not in config:
raise ValueError(Errors.E985.format(config=config))
+ # fill in [nlp.vectors] if not present (as a narrower alternative to
+ # auto-filling [nlp] from the default config)
+ if "vectors" not in config["nlp"]:
+ config["nlp"]["vectors"] = {"@vectors": "spacy.Vectors.v1"}
config_lang = config["nlp"].get("lang")
if config_lang is not None and config_lang != cls.lang:
raise ValueError(
@@ -1795,6 +1806,7 @@ class Language:
filled["nlp"], validate=validate, schema=ConfigSchemaNlp
)
create_tokenizer = resolved_nlp["tokenizer"]
+ create_vectors = resolved_nlp["vectors"]
before_creation = resolved_nlp["before_creation"]
after_creation = resolved_nlp["after_creation"]
after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
@@ -1815,7 +1827,12 @@ class Language:
# inside stuff like the spacy train function. If we loaded them here,
# then we would load them twice at runtime: once when we make from config,
# and then again when we load from disk.
- nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
+ nlp = lang_cls(
+ vocab=vocab,
+ create_tokenizer=create_tokenizer,
+ create_vectors=create_vectors,
+ meta=meta,
+ )
if after_creation is not None:
nlp = after_creation(nlp)
if not isinstance(nlp, cls):
@@ -1825,7 +1842,6 @@ class Language:
# Later we replace the component config with the raw config again.
interpolated = filled.interpolate() if not filled.is_interpolated else filled
pipeline = interpolated.get("components", {})
- sourced = util.get_sourced_components(interpolated)
# If components are loaded from a source (existing models), we cache
# them here so they're only loaded once
source_nlps = {}
@@ -1958,7 +1974,7 @@ class Language:
useful when training a pipeline with components sourced from an existing
pipeline: if multiple components (e.g. tagger, parser, NER) listen to
the same tok2vec component, but some of them are frozen and not updated,
- their performance may degrade significally as the tok2vec component is
+ their performance may degrade significantly as the tok2vec component is
updated with new data. To prevent this, listeners can be replaced with
a standalone tok2vec layer that is owned by the component and doesn't
change if the component isn't updated.
@@ -2033,8 +2049,20 @@ class Language:
# Go over the listener layers and replace them
for listener in pipe_listeners:
new_model = tok2vec_model.copy()
- if "replace_listener" in tok2vec_model.attrs:
- new_model = tok2vec_model.attrs["replace_listener"](new_model)
+ replace_listener_func = tok2vec_model.attrs.get("replace_listener")
+ if replace_listener_func is not None:
+ # Pass the extra args to the callback without breaking compatibility with
+ # old library versions that only expect a single parameter.
+ num_params = len(
+ inspect.signature(replace_listener_func).parameters
+ )
+ if num_params == 1:
+ new_model = replace_listener_func(new_model)
+ elif num_params == 3:
+ new_model = replace_listener_func(new_model, listener, tok2vec)
+ else:
+ raise ValueError(Errors.E1055.format(num_params=num_params))
+
util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined]
tok2vec.remove_listener(listener, pipe_name)
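
The replace_listener change above dispatches on the callback's arity so that older single-argument callbacks keep working while newer ones also receive the listener and the tok2vec pipe. A standalone sketch of that dispatch (the helper name is illustrative, not spaCy API):

import inspect

def call_replace_listener(func, new_model, listener, tok2vec):
    # same arity check as in replace_listeners() above
    num_params = len(inspect.signature(func).parameters)
    if num_params == 1:
        return func(new_model)  # legacy single-argument callback
    elif num_params == 3:
        return func(new_model, listener, tok2vec)
    raise ValueError(f"unexpected number of parameters: {num_params}")
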
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 00e2c6258..f803d5e93 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,7 +1,7 @@
# cython: embedsignature=True
+# cython: profile=False
# Compiler crashes on memory view coercion without this. Should report bug.
cimport numpy as np
-from cython.view cimport array as cvarray
from libc.string cimport memset
np.import_array()
@@ -35,7 +35,7 @@ from .typedefs cimport attr_t, flags_t
from .attrs import intify_attrs
from .errors import Errors, Warnings
-OOV_RANK = 0xffffffffffffffff # UINT64_MAX
+OOV_RANK = 0xffffffffffffffff # UINT64_MAX
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.id = OOV_RANK
@@ -105,7 +105,7 @@ cdef class Lexeme:
if isinstance(value, float):
continue
elif isinstance(value, (int, long)):
- Lexeme.set_struct_attr(self.c, attr, value)
+ Lexeme.set_struct_attr(self.c, attr, value)
else:
Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
@@ -137,10 +137,12 @@ cdef class Lexeme:
if hasattr(other, "orth"):
if self.c.orth == other.orth:
return 1.0
- elif hasattr(other, "__len__") and len(other) == 1 \
- and hasattr(other[0], "orth"):
- if self.c.orth == other[0].orth:
- return 1.0
+ elif (
+ hasattr(other, "__len__") and len(other) == 1
+ and hasattr(other[0], "orth")
+ and self.c.orth == other[0].orth
+ ):
+ return 1.0
if self.vector_norm == 0 or other.vector_norm == 0:
warnings.warn(Warnings.W008.format(obj="Lexeme"))
return 0.0
@@ -149,7 +151,7 @@ cdef class Lexeme:
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
-
+
@property
def has_vector(self):
"""RETURNS (bool): Whether a word vector is associated with the object.
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index a214c0668..ab5f5d5d1 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True
+# cython: infer_types=True
import warnings
from collections import defaultdict
from itertools import product
@@ -108,7 +108,7 @@ cdef class DependencyMatcher:
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
- return self.has_key(key)
+ return self.has_key(key) # no-cython-lint: W601
def _validate_input(self, pattern, key):
idx = 0
@@ -129,6 +129,7 @@ cdef class DependencyMatcher:
else:
required_keys = {"RIGHT_ID", "RIGHT_ATTRS", "REL_OP", "LEFT_ID"}
relation_keys = set(relation.keys())
+ # Identify required keys that have not been specified
missing = required_keys - relation_keys
if missing:
missing_txt = ", ".join(list(missing))
@@ -136,6 +137,13 @@ cdef class DependencyMatcher:
required=required_keys,
missing=missing_txt
))
+ # Identify additional, unsupported keys
+ unsupported = relation_keys - required_keys
+ if unsupported:
+ unsupported_txt = ", ".join(list(unsupported))
+ warnings.warn(Warnings.W126.format(
+ unsupported=unsupported_txt
+ ))
if (
relation["RIGHT_ID"] in visited_nodes
or relation["LEFT_ID"] not in visited_nodes
@@ -264,7 +272,7 @@ cdef class DependencyMatcher:
def remove(self, key):
key = self._normalize_key(key)
- if not key in self._patterns:
+ if key not in self._patterns:
raise ValueError(Errors.E175.format(key=key))
self._patterns.pop(key)
self._raw_patterns.pop(key)
@@ -382,7 +390,7 @@ cdef class DependencyMatcher:
return []
return [doc[node].head]
- def _gov(self,doc,node):
+ def _gov(self, doc, node):
return list(doc[node].children)
def _dep_chain(self, doc, node):
@@ -443,7 +451,7 @@ cdef class DependencyMatcher:
def _right_child(self, doc, node):
return [child for child in doc[node].rights]
-
+
def _left_child(self, doc, node):
return [child for child in doc[node].lefts]
@@ -461,7 +469,7 @@ cdef class DependencyMatcher:
if doc[node].head.i > node:
return [doc[node].head]
return []
-
+
def _left_parent(self, doc, node):
if doc[node].head.i < node:
return [doc[node].head]
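
The new unsupported-key check warns (W126) about pattern keys outside RIGHT_ID/RIGHT_ATTRS/REL_OP/LEFT_ID instead of silently ignoring them. A minimal pattern that should emit the warning when added; the stray "TYPO_KEY" is deliberate and purely illustrative:

import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.blank("en")
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
        "TYPO_KEY": {},  # unsupported key, should trigger W126
    },
]
matcher.add("VERB_SUBJECT", [pattern])
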
diff --git a/spacy/matcher/levenshtein.pyx b/spacy/matcher/levenshtein.pyx
index e823ce99d..e394f2cf4 100644
--- a/spacy/matcher/levenshtein.pyx
+++ b/spacy/matcher/levenshtein.pyx
@@ -1,4 +1,4 @@
-# cython: profile=True, binding=True, infer_types=True
+# cython: binding=True, infer_types=True
from cpython.object cimport PyObject
from libc.stdint cimport int64_t
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 3d03f37ae..9a9ed4212 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -1,4 +1,4 @@
-# cython: binding=True, infer_types=True, profile=True
+# cython: binding=True, infer_types=True
from typing import Iterable, List
from cymem.cymem cimport Pool
@@ -12,31 +12,18 @@ import warnings
import srsly
-from ..attrs cimport (
- DEP,
- ENT_IOB,
- ID,
- LEMMA,
- MORPH,
- NULL_ATTR,
- ORTH,
- POS,
- TAG,
- attr_id_t,
-)
+from ..attrs cimport DEP, ENT_IOB, ID, LEMMA, MORPH, NULL_ATTR, POS, TAG
from ..structs cimport TokenC
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.morphanalysis cimport MorphAnalysis
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..typedefs cimport attr_t
-from ..vocab cimport Vocab
from ..attrs import IDS
from ..errors import Errors, MatchPatternError, Warnings
from ..schemas import validate_token_pattern
from ..strings import get_string_id
-from ..util import registry
from .levenshtein import levenshtein_compare
DEF PADDING = 5
@@ -87,9 +74,9 @@ cdef class Matcher:
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
- return self.has_key(key)
+ return self.has_key(key) # no-cython-lint: W601
- def add(self, key, patterns, *, on_match=None, greedy: str=None):
+ def add(self, key, patterns, *, on_match=None, greedy: str = None):
"""Add a match-rule to the matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.
@@ -143,8 +130,13 @@ cdef class Matcher:
key = self._normalize_key(key)
for pattern in patterns:
try:
- specs = _preprocess_pattern(pattern, self.vocab,
- self._extensions, self._extra_predicates, self._fuzzy_compare)
+ specs = _preprocess_pattern(
+ pattern,
+ self.vocab,
+ self._extensions,
+ self._extra_predicates,
+ self._fuzzy_compare
+ )
self.patterns.push_back(init_pattern(self.mem, key, specs))
for spec in specs:
for attr, _ in spec[1]:
@@ -168,7 +160,7 @@ cdef class Matcher:
key (str): The ID of the match rule.
"""
norm_key = self._normalize_key(key)
- if not norm_key in self._patterns:
+ if norm_key not in self._patterns:
raise ValueError(Errors.E175.format(key=key))
self._patterns.pop(norm_key)
self._callbacks.pop(norm_key)
@@ -268,8 +260,15 @@ cdef class Matcher:
if self.patterns.empty():
matches = []
else:
- matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
- extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
+ matches = find_matches(
+ &self.patterns[0],
+ self.patterns.size(),
+ doclike,
+ length,
+ extensions=self._extensions,
+ predicates=self._extra_predicates,
+ with_alignments=with_alignments
+ )
final_matches = []
pairs_by_id = {}
# For each key, either add all matches, or only the filtered,
@@ -289,9 +288,9 @@ cdef class Matcher:
memset(matched, 0, length * sizeof(matched[0]))
span_filter = self._filter.get(key)
if span_filter == "FIRST":
- sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start
+ sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start
elif span_filter == "LONGEST":
- sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length
+ sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length
else:
raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter))
for match in sorted_pairs:
@@ -366,7 +365,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
cdef vector[MatchC] matches
cdef vector[vector[MatchAlignmentC]] align_states
cdef vector[vector[MatchAlignmentC]] align_matches
- cdef PatternStateC state
cdef int i, j, nr_extra_attr
cdef Pool mem = Pool()
output = []
@@ -388,14 +386,22 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
value = token.vocab.strings[value]
extra_attr_values[i * nr_extra_attr + index] = value
# Main loop
- cdef int nr_predicate = len(predicates)
for i in range(length):
for j in range(n):
states.push_back(PatternStateC(patterns[j], i, 0))
if with_alignments != 0:
align_states.resize(states.size())
- transition_states(states, matches, align_states, align_matches, predicate_cache,
- doclike[i], extra_attr_values, predicates, with_alignments)
+ transition_states(
+ states,
+ matches,
+ align_states,
+ align_matches,
+ predicate_cache,
+ doclike[i],
+ extra_attr_values,
+ predicates,
+ with_alignments
+ )
extra_attr_values += nr_extra_attr
predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns
@@ -421,18 +427,28 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
return output
-cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
- vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
- int8_t* cached_py_predicates,
- Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
+cdef void transition_states(
+ vector[PatternStateC]& states,
+ vector[MatchC]& matches,
+ vector[vector[MatchAlignmentC]]& align_states,
+ vector[vector[MatchAlignmentC]]& align_matches,
+ int8_t* cached_py_predicates,
+ Token token,
+ const attr_t* extra_attrs,
+ py_predicates,
+ bint with_alignments
+) except *:
cdef int q = 0
cdef vector[PatternStateC] new_states
cdef vector[vector[MatchAlignmentC]] align_new_states
- cdef int nr_predicate = len(py_predicates)
for i in range(states.size()):
if states[i].pattern.nr_py >= 1:
- update_predicate_cache(cached_py_predicates,
- states[i].pattern, token, py_predicates)
+ update_predicate_cache(
+ cached_py_predicates,
+ states[i].pattern,
+ token,
+ py_predicates
+ )
action = get_action(states[i], token.c, extra_attrs,
cached_py_predicates)
if action == REJECT:
@@ -468,8 +484,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
align_new_states.push_back(align_states[q])
states[q].pattern += 1
if states[q].pattern.nr_py != 0:
- update_predicate_cache(cached_py_predicates,
- states[q].pattern, token, py_predicates)
+ update_predicate_cache(
+ cached_py_predicates,
+ states[q].pattern,
+ token,
+ py_predicates
+ )
action = get_action(states[q], token.c, extra_attrs,
cached_py_predicates)
# Update alignment before the transition of current state
@@ -485,8 +505,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
ent_id = get_ent_id(state.pattern)
if action == MATCH:
matches.push_back(
- MatchC(pattern_id=ent_id, start=state.start,
- length=state.length+1))
+ MatchC(
+ pattern_id=ent_id,
+ start=state.start,
+ length=state.length+1
+ )
+ )
# `align_matches` always corresponds to `matches` 1:1
if with_alignments != 0:
align_matches.push_back(align_states[q])
@@ -494,23 +518,35 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
# push match without last token if length > 0
if state.length > 0:
matches.push_back(
- MatchC(pattern_id=ent_id, start=state.start,
- length=state.length))
+ MatchC(
+ pattern_id=ent_id,
+ start=state.start,
+ length=state.length
+ )
+ )
# MATCH_DOUBLE emits matches twice,
# add one more to align_matches in order to keep 1:1 relationship
if with_alignments != 0:
align_matches.push_back(align_states[q])
# push match with last token
matches.push_back(
- MatchC(pattern_id=ent_id, start=state.start,
- length=state.length+1))
+ MatchC(
+ pattern_id=ent_id,
+ start=state.start,
+ length=state.length + 1
+ )
+ )
# `align_matches` always corresponds to `matches` 1:1
if with_alignments != 0:
align_matches.push_back(align_states[q])
elif action == MATCH_REJECT:
matches.push_back(
- MatchC(pattern_id=ent_id, start=state.start,
- length=state.length))
+ MatchC(
+ pattern_id=ent_id,
+ start=state.start,
+ length=state.length
+ )
+ )
# `align_matches` always corresponds to `matches` 1:1
if with_alignments != 0:
align_matches.push_back(align_states[q])
@@ -533,8 +569,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
align_states.push_back(align_new_states[i])
-cdef int update_predicate_cache(int8_t* cache,
- const TokenPatternC* pattern, Token token, predicates) except -1:
+cdef int update_predicate_cache(
+ int8_t* cache,
+ const TokenPatternC* pattern,
+ Token token,
+ predicates
+) except -1:
# If the state references any extra predicates, check whether they match.
# These are cached, so that we don't call these potentially expensive
# Python functions more than we need to.
@@ -580,10 +620,12 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
else:
state.pattern += 1
-
-cdef action_t get_action(PatternStateC state,
- const TokenC* token, const attr_t* extra_attrs,
- const int8_t* predicate_matches) nogil:
+cdef action_t get_action(
+ PatternStateC state,
+    const TokenC* token,
+    const attr_t* extra_attrs,
+    const int8_t* predicate_matches
+) nogil:
"""We need to consider:
a) Does the token match the specification? [Yes, No]
b) What's the quantifier? [1, 0+, ?]
@@ -649,53 +691,56 @@ cdef action_t get_action(PatternStateC state,
is_match = not is_match
quantifier = ONE
if quantifier == ONE:
- if is_match and is_final:
- # Yes, final: 1000
- return MATCH
- elif is_match and not is_final:
- # Yes, non-final: 0100
- return ADVANCE
- elif not is_match and is_final:
- # No, final: 0000
- return REJECT
- else:
- return REJECT
+ if is_match and is_final:
+ # Yes, final: 1000
+ return MATCH
+ elif is_match and not is_final:
+ # Yes, non-final: 0100
+ return ADVANCE
+ elif not is_match and is_final:
+ # No, final: 0000
+ return REJECT
+ else:
+ return REJECT
elif quantifier == ZERO_PLUS:
- if is_match and is_final:
- # Yes, final: 1001
- return MATCH_EXTEND
- elif is_match and not is_final:
- # Yes, non-final: 0011
- return RETRY_EXTEND
- elif not is_match and is_final:
- # No, final 2000 (note: Don't include last token!)
- return MATCH_REJECT
- else:
- # No, non-final 0010
- return RETRY
+ if is_match and is_final:
+ # Yes, final: 1001
+ return MATCH_EXTEND
+ elif is_match and not is_final:
+ # Yes, non-final: 0011
+ return RETRY_EXTEND
+ elif not is_match and is_final:
+ # No, final 2000 (note: Don't include last token!)
+ return MATCH_REJECT
+ else:
+ # No, non-final 0010
+ return RETRY
elif quantifier == ZERO_ONE:
- if is_match and is_final:
- # Yes, final: 3000
- # To cater for a pattern ending in "?", we need to add
- # a match both with and without the last token
- return MATCH_DOUBLE
- elif is_match and not is_final:
- # Yes, non-final: 0110
- # We need both branches here, consider a pair like:
- # pattern: .?b string: b
- # If we 'ADVANCE' on the .?, we miss the match.
- return RETRY_ADVANCE
- elif not is_match and is_final:
- # No, final 2000 (note: Don't include last token!)
- return MATCH_REJECT
- else:
- # No, non-final 0010
- return RETRY
+ if is_match and is_final:
+ # Yes, final: 3000
+ # To cater for a pattern ending in "?", we need to add
+ # a match both with and without the last token
+ return MATCH_DOUBLE
+ elif is_match and not is_final:
+ # Yes, non-final: 0110
+ # We need both branches here, consider a pair like:
+ # pattern: .?b string: b
+ # If we 'ADVANCE' on the .?, we miss the match.
+ return RETRY_ADVANCE
+ elif not is_match and is_final:
+ # No, final 2000 (note: Don't include last token!)
+ return MATCH_REJECT
+ else:
+ # No, non-final 0010
+ return RETRY
-cdef int8_t get_is_match(PatternStateC state,
- const TokenC* token, const attr_t* extra_attrs,
- const int8_t* predicate_matches) nogil:
+cdef int8_t get_is_match(
+ PatternStateC state,
+ const TokenC* token,
+ const attr_t* extra_attrs,
+ const int8_t* predicate_matches
+) nogil:
for i in range(state.pattern.nr_py):
if predicate_matches[state.pattern.py_predicates[i]] == -1:
return 0
@@ -860,7 +905,7 @@ class _FuzzyPredicate:
self.is_extension = is_extension
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
- fuzz = self.predicate[len("FUZZY"):] # number after prefix
+ fuzz = self.predicate[len("FUZZY"):] # number after prefix
self.fuzzy = int(fuzz) if fuzz else -1
self.fuzzy_compare = fuzzy_compare
self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
@@ -1082,7 +1127,7 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
elif cls == _FuzzyPredicate:
if isinstance(value, dict):
# add predicates inside fuzzy operator
- fuzz = type_[len("FUZZY"):] # number after prefix
+ fuzz = type_[len("FUZZY"):] # number after prefix
fuzzy_val = int(fuzz) if fuzz else -1
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
extra_predicates, seen_predicates,
@@ -1101,8 +1146,9 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
return output
-def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
- seen_predicates):
+def _get_extension_extra_predicates(
+ spec, extra_predicates, predicate_types, seen_predicates
+):
output = []
for attr, value in spec.items():
if isinstance(value, dict):
@@ -1131,7 +1177,7 @@ def _get_operators(spec):
return (ONE,)
elif spec["OP"] in lookup:
return lookup[spec["OP"]]
- #Min_max {n,m}
+ # Min_max {n,m}
elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
# {n} --> {n,n} exactly n ONE,(n)
# {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)
@@ -1142,8 +1188,8 @@ def _get_operators(spec):
min_max = min_max if "," in min_max else f"{min_max},{min_max}"
n, m = min_max.split(",")
- #1. Either n or m is a blank string and the other is numeric -->isdigit
- #2. Both are numeric and n <= m
+ # 1. Either n or m is a blank string and the other is numeric -->isdigit
+ # 2. Both are numeric and n <= m
if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
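
For reference, the {n,m} operator whose parsing is commented above can be exercised as follows; the matcher returns every span whose repeated-token count falls inside the bounds. Sentence and pattern values are illustrative.

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# "very" must occur between two and four times, followed by "good"
matcher.add("VERY", [[{"LOWER": "very", "OP": "{2,4}"}, {"LOWER": "good"}]])

doc = nlp("that was very very very good")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
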
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index c407cf1cc..4efcdb05c 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -1,14 +1,12 @@
-# cython: infer_types=True, profile=True
-from libc.stdint cimport uintptr_t
+# cython: infer_types=True
from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set
import warnings
-from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
+from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG
from ..attrs import IDS
-from ..structs cimport TokenC
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..typedefs cimport attr_t
diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd
index ca31c1699..4d2d7b3fe 100644
--- a/spacy/ml/parser_model.pxd
+++ b/spacy/ml/parser_model.pxd
@@ -40,11 +40,16 @@ cdef ActivationsC alloc_activations(SizesC n) nogil
cdef void free_activations(const ActivationsC* A) nogil
-cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
- const WeightsC* W, SizesC n) nogil
-
+cdef void predict_states(
+ CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
+) nogil
+
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
-cdef void cpu_log_loss(float* d_scores,
- const float* costs, const int* is_valid, const float* scores, int O) nogil
-
+cdef void cpu_log_loss(
+ float* d_scores,
+ const float* costs,
+ const int* is_valid,
+ const float* scores,
+ int O
+) nogil
diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx
index 5cffc4c2d..f004c562e 100644
--- a/spacy/ml/parser_model.pyx
+++ b/spacy/ml/parser_model.pyx
@@ -1,4 +1,5 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
+# cython: profile=False
cimport numpy as np
from libc.math cimport exp
from libc.stdlib cimport calloc, free, realloc
@@ -8,13 +9,13 @@ from thinc.backends.linalg cimport Vec, VecVec
import numpy
import numpy.random
-from thinc.api import CupyOps, Model, NumpyOps, get_ops
+from thinc.api import CupyOps, Model, NumpyOps
from .. import util
from ..errors import Errors
from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport class_t, hash_t, weight_t
+from ..typedefs cimport weight_t
cdef WeightsC get_c_weights(model) except *:
@@ -78,33 +79,48 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
A.is_valid = calloc(n.states * n.classes, sizeof(A.is_valid[0]))
A._max_size = n.states
else:
- A.token_ids = realloc(A.token_ids,
- n.states * n.feats * sizeof(A.token_ids[0]))
- A.scores = realloc(A.scores,
- n.states * n.classes * sizeof(A.scores[0]))
- A.unmaxed = realloc(A.unmaxed,
- n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
- A.hiddens = realloc(A.hiddens,
- n.states * n.hiddens * sizeof(A.hiddens[0]))
- A.is_valid = realloc(A.is_valid,
- n.states * n.classes * sizeof(A.is_valid[0]))
+ A.token_ids = realloc(
+ A.token_ids, n.states * n.feats * sizeof(A.token_ids[0])
+ )
+ A.scores = realloc(
+ A.scores, n.states * n.classes * sizeof(A.scores[0])
+ )
+ A.unmaxed = realloc(
+ A.unmaxed, n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])
+ )
+ A.hiddens = realloc(
+ A.hiddens, n.states * n.hiddens * sizeof(A.hiddens[0])
+ )
+ A.is_valid = realloc(
+ A.is_valid, n.states * n.classes * sizeof(A.is_valid[0])
+ )
A._max_size = n.states
A._curr_size = n.states
-cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
- const WeightsC* W, SizesC n) nogil:
- cdef double one = 1.0
+cdef void predict_states(
+ CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
+) nogil:
resize_activations(A, n)
for i in range(n.states):
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
- sum_state_features(cblas, A.unmaxed,
- W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+ sum_state_features(
+ cblas,
+ A.unmaxed,
+ W.feat_weights,
+ A.token_ids,
+ n.states,
+ n.feats,
+ n.hiddens * n.pieces
+ )
for i in range(n.states):
- VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
- W.feat_bias, 1., n.hiddens * n.pieces)
+ VecVec.add_i(
+ &A.unmaxed[i*n.hiddens*n.pieces],
+ W.feat_bias, 1.,
+ n.hiddens * n.pieces
+ )
for j in range(n.hiddens):
index = i * n.hiddens * n.pieces + j * n.pieces
which = Vec.arg_max(&A.unmaxed[index], n.pieces)
@@ -114,14 +130,15 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
else:
# Compute hidden-to-output
- sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+ sgemm(cblas)(
+ False, True, n.states, n.classes, n.hiddens,
1.0, A.hiddens, n.hiddens,
W.hidden_weights, n.hiddens,
- 0.0, A.scores, n.classes)
+ 0.0, A.scores, n.classes
+ )
# Add bias
for i in range(n.states):
- VecVec.add_i(&A.scores[i*n.classes],
- W.hidden_bias, 1., n.classes)
+ VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes)
# Set unseen classes to minimum value
i = 0
min_ = A.scores[0]
@@ -134,9 +151,16 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
A.scores[i*n.classes+j] = min_
-cdef void sum_state_features(CBlas cblas, float* output,
- const float* cached, const int* token_ids, int B, int F, int O) nogil:
- cdef int idx, b, f, i
+cdef void sum_state_features(
+ CBlas cblas,
+ float* output,
+ const float* cached,
+ const int* token_ids,
+ int B,
+ int F,
+ int O
+) nogil:
+ cdef int idx, b, f
cdef const float* feature
padding = cached
cached += F * O
@@ -153,9 +177,13 @@ cdef void sum_state_features(CBlas cblas, float* output,
token_ids += F
-cdef void cpu_log_loss(float* d_scores,
- const float* costs, const int* is_valid, const float* scores,
- int O) nogil:
+cdef void cpu_log_loss(
+ float* d_scores,
+ const float* costs,
+ const int* is_valid,
+ const float* scores,
+ int O
+) nogil:
"""Do multi-label log loss"""
cdef double max_, gmax, Z, gZ
best = arg_max_if_gold(scores, costs, is_valid, O)
@@ -179,8 +207,9 @@ cdef void cpu_log_loss(float* d_scores,
d_scores[i] = exp(scores[i]-max_) / Z
-cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
- const int* is_valid, int n) nogil:
+cdef int arg_max_if_gold(
+ const weight_t* scores, const weight_t* costs, const int* is_valid, int n
+) nogil:
# Find minimum cost
cdef float cost = 1
for i in range(n):
@@ -204,10 +233,17 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
return best
-
class ParserStepModel(Model):
- def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
- dropout=0.1):
+ def __init__(
+ self,
+ docs,
+ layers,
+ *,
+ has_upper,
+ unseen_classes=None,
+ train=True,
+ dropout=0.1
+ ):
Model.__init__(self, name="parser_step_model", forward=step_forward)
self.attrs["has_upper"] = has_upper
self.attrs["dropout_rate"] = dropout
@@ -268,8 +304,10 @@ class ParserStepModel(Model):
return ids
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
- if isinstance(self.state2vec.ops, CupyOps) \
- and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+ if (
+ isinstance(self.state2vec.ops, CupyOps)
+ and not isinstance(token_ids, self.state2vec.ops.xp.ndarray)
+ ):
# Move token_ids and d_vector to GPU, asynchronously
self.backprops.append((
util.get_async(self.cuda_stream, token_ids),
@@ -279,7 +317,6 @@ class ParserStepModel(Model):
else:
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
-
def finish_steps(self, golds):
# Add a padding vector to the d_tokvecs gradient, so that missing
# values don't affect the real gradient.
@@ -292,14 +329,15 @@ class ParserStepModel(Model):
ids = ids.flatten()
d_state_features = d_state_features.reshape(
(ids.size, d_state_features.shape[2]))
- self.ops.scatter_add(d_tokvecs, ids,
- d_state_features)
+ self.ops.scatter_add(d_tokvecs, ids, d_state_features)
# Padded -- see update()
self.bp_tokvecs(d_tokvecs[:-1])
return d_tokvecs
+
NUMPY_OPS = NumpyOps()
+
def step_forward(model: ParserStepModel, states, is_train):
token_ids = model.get_token_ids(states)
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
@@ -312,7 +350,7 @@ def step_forward(model: ParserStepModel, states, is_train):
scores, get_d_vector = model.vec2scores(vector, is_train)
else:
scores = NumpyOps().asarray(vector)
- get_d_vector = lambda d_scores: d_scores
+ get_d_vector = lambda d_scores: d_scores # no-cython-lint: E731
# If the class is unseen, make sure its score is minimum
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
@@ -448,9 +486,11 @@ cdef class precompute_hiddens:
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
- sum_state_features(cblas, state_vector.data,
- feat_weights, &ids[0,0],
- token_ids.shape[0], self.nF, self.nO*self.nP)
+ sum_state_features(
+ cblas, state_vector.data,
+ feat_weights, &ids[0, 0],
+ token_ids.shape[0], self.nF, self.nO*self.nP
+ )
state_vector += self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
@@ -475,7 +515,7 @@ cdef class precompute_hiddens:
def backprop_maxout(d_best):
return self.ops.backprop_maxout(d_best, mask, self.nP)
-
+
return state_vector, backprop_maxout
def _relu_nonlinearity(self, state_vector):
@@ -489,5 +529,5 @@ cdef class precompute_hiddens:
def backprop_relu(d_best):
d_best *= mask
return d_best.reshape((d_best.shape + (1,)))
-
+
return state_vector, backprop_relu
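
precompute_hiddens relies on thinc's maxout ops for its nonlinearity. Purely as a NumPy illustration of what maxout and its backward pass compute (not the thinc implementation), assuming a (states, hiddens, pieces) activation layout:

import numpy as np

def maxout(x):  # x: (n_states, n_hidden, n_pieces)
    which = x.argmax(axis=-1)   # index of the winning piece per unit
    best = x.max(axis=-1)
    return best, which

def backprop_maxout(d_best, which, n_pieces):
    # route the gradient only to the piece that won the forward pass
    d_x = np.zeros(d_best.shape + (n_pieces,), dtype=d_best.dtype)
    np.put_along_axis(d_x, which[..., None], d_best[..., None], axis=-1)
    return d_x
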
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index b75240c5d..1a1b0a0ff 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -9,7 +9,7 @@ from thinc.util import partial
from ..attrs import ORTH
from ..errors import Errors, Warnings
from ..tokens import Doc
-from ..vectors import Mode
+from ..vectors import Mode, Vectors
from ..vocab import Vocab
@@ -48,11 +48,14 @@ def forward(
key_attr: int = getattr(vocab.vectors, "attr", ORTH)
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
- if vocab.vectors.mode == Mode.default:
+ if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default:
V = model.ops.asarray(vocab.vectors.data)
rows = vocab.vectors.find(keys=keys)
V = model.ops.as_contig(V[rows])
- elif vocab.vectors.mode == Mode.floret:
+ elif isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.floret:
+ V = vocab.vectors.get_batch(keys)
+ V = model.ops.as_contig(V)
+ elif hasattr(vocab.vectors, "get_batch"):
V = vocab.vectors.get_batch(keys)
V = model.ops.as_contig(V)
else:
@@ -61,7 +64,7 @@ def forward(
vectors_data = model.ops.gemm(V, W, trans2=True)
except ValueError:
raise RuntimeError(Errors.E896)
- if vocab.vectors.mode == Mode.default:
+ if isinstance(vocab.vectors, Vectors) and vocab.vectors.mode == Mode.default:
# Convert negative indices to 0-vectors
# TODO: more options for UNK tokens
vectors_data[rows < 0] = 0
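
The dispatch added to forward() keeps the default Vectors path unchanged while routing floret vectors and any custom BaseVectors implementation through get_batch(). A condensed, illustrative restatement (the helper name and the ops handling are assumptions, not the actual model code):

from spacy.vectors import Mode, Vectors

def lookup_vectors(vocab, keys, ops):
    vecs = vocab.vectors
    if isinstance(vecs, Vectors) and vecs.mode == Mode.default:
        # default table: look up rows by key
        rows = vecs.find(keys=keys)
        return ops.as_contig(ops.asarray(vecs.data)[rows])
    elif hasattr(vecs, "get_batch"):
        # floret vectors or a custom BaseVectors subclass
        return ops.as_contig(vecs.get_batch(keys))
    raise ValueError("unsupported vectors type")
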
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index 968764b82..ee43aa4ec 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -11,7 +11,7 @@ from .typedefs cimport attr_t, hash_t
cdef class Morphology:
cdef readonly Pool mem
cdef readonly StringStore strings
- cdef PreshMap tags # Keyed by hash, value is pointer to tag
+ cdef PreshMap tags # Keyed by hash, value is pointer to tag
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
cdef int insert(self, MorphAnalysisC tag) except -1
@@ -20,4 +20,8 @@ cdef class Morphology:
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
cdef list list_features(const MorphAnalysisC* morph)
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
-cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
+cdef int get_n_by_field(
+ attr_t* results,
+ const MorphAnalysisC* morph,
+ attr_t field,
+) nogil
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 1062fff09..cef45b04d 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,4 +1,5 @@
# cython: infer_types
+# cython: profile=False
import warnings
import numpy
@@ -83,10 +84,11 @@ cdef class Morphology:
features = self.normalize_attrs(features)
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# normalized UFEATS string with sorted fields and values
- norm_feats_string = self.FEATURE_SEP.join(sorted([
- self.FIELD_SEP.join([field, values])
- for field, values in string_features.items()
- ]))
+ norm_feats_string = self.FEATURE_SEP.join(
+ sorted(
+ [self.FIELD_SEP.join([field, values]) for field, values in string_features.items()]
+ )
+ )
return norm_feats_string or self.EMPTY_MORPH
def normalize_attrs(self, attrs):
@@ -192,6 +194,7 @@ cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t fie
n_results += 1
return n_results
+
def unpickle_morphology(strings, tags):
cdef Morphology morphology = Morphology(strings)
for tag in tags:
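
The joined-and-sorted FEATS string built above follows the usual UD conventions, with "=" between field and value and "|" between features (FIELD_SEP and FEATURE_SEP in Morphology). A plain-Python sketch of the same normalization:

def norm_feats_string(string_features: dict) -> str:
    # sort the rendered field=value pairs and join them with "|"
    return "|".join(
        sorted("=".join([field, values]) for field, values in string_features.items())
    )

print(norm_feats_string({"Number": "Sing", "Case": "Nom"}))  # Case=Nom|Number=Sing
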
diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd
index a0b2567f1..b5423d113 100644
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -8,7 +8,7 @@ cpdef enum univ_pos_t:
ADV
AUX
CONJ
- CCONJ # U20
+ CCONJ # U20
DET
INTJ
NOUN
diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx
index e71fb917f..98e3570ec 100644
--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@@ -1,4 +1,4 @@
-
+# cython: profile=False
IDS = {
"": NO_TAG,
"ADJ": ADJ,
diff --git a/spacy/pipeline/_edit_tree_internals/edit_trees.pxd b/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
index 3d63af921..41acd2b07 100644
--- a/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
+++ b/spacy/pipeline/_edit_tree_internals/edit_trees.pxd
@@ -46,11 +46,18 @@ cdef struct EditTreeC:
bint is_match_node
NodeC inner
-cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len,
- uint32_t prefix_tree, uint32_t suffix_tree):
- cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len,
- suffix_len=suffix_len, prefix_tree=prefix_tree,
- suffix_tree=suffix_tree)
+cdef inline EditTreeC edittree_new_match(
+ len_t prefix_len,
+ len_t suffix_len,
+ uint32_t prefix_tree,
+ uint32_t suffix_tree
+):
+ cdef MatchNodeC match_node = MatchNodeC(
+ prefix_len=prefix_len,
+ suffix_len=suffix_len,
+ prefix_tree=prefix_tree,
+ suffix_tree=suffix_tree
+ )
cdef NodeC inner = NodeC(match_node=match_node)
return EditTreeC(is_match_node=True, inner=inner)
diff --git a/spacy/pipeline/_edit_tree_internals/edit_trees.pyx b/spacy/pipeline/_edit_tree_internals/edit_trees.pyx
index daab0d204..7abd9f2a6 100644
--- a/spacy/pipeline/_edit_tree_internals/edit_trees.pyx
+++ b/spacy/pipeline/_edit_tree_internals/edit_trees.pyx
@@ -1,12 +1,11 @@
# cython: infer_types=True, binding=True
+# cython: profile=False
from cython.operator cimport dereference as deref
from libc.stdint cimport UINT32_MAX, uint32_t
from libc.string cimport memset
from libcpp.pair cimport pair
from libcpp.vector cimport vector
-from pathlib import Path
-
from ...typedefs cimport hash_t
from ... import util
@@ -25,17 +24,16 @@ cdef LCS find_lcs(str source, str target):
target (str): The second string.
RETURNS (LCS): The spans of the longest common subsequences.
"""
- cdef Py_ssize_t source_len = len(source)
cdef Py_ssize_t target_len = len(target)
- cdef size_t longest_align = 0;
+ cdef size_t longest_align = 0
cdef int source_idx, target_idx
cdef LCS lcs
cdef Py_UCS4 source_cp, target_cp
memset(&lcs, 0, sizeof(lcs))
- cdef vector[size_t] prev_aligns = vector[size_t](target_len);
- cdef vector[size_t] cur_aligns = vector[size_t](target_len);
+ cdef vector[size_t] prev_aligns = vector[size_t](target_len)
+ cdef vector[size_t] cur_aligns = vector[size_t](target_len)
for (source_idx, source_cp) in enumerate(source):
for (target_idx, target_cp) in enumerate(target):
@@ -89,7 +87,7 @@ cdef class EditTrees:
cdef LCS lcs = find_lcs(form, lemma)
cdef EditTreeC tree
- cdef uint32_t tree_id, prefix_tree, suffix_tree
+ cdef uint32_t prefix_tree, suffix_tree
if lcs_is_empty(lcs):
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
else:
@@ -108,7 +106,7 @@ cdef class EditTrees:
return self._tree_id(tree)
cdef uint32_t _tree_id(self, EditTreeC tree):
- # If this tree has been constructed before, return its identifier.
+ # If this tree has been constructed before, return its identifier.
cdef hash_t hash = edittree_hash(tree)
cdef unordered_map[hash_t, uint32_t].iterator iter = self.map.find(hash)
if iter != self.map.end():
@@ -289,6 +287,7 @@ def _tree2dict(tree):
tree = tree["inner"]["subst_node"]
return(dict(tree))
+
def _dict2tree(tree):
errors = validate_edit_tree(tree)
if errors:
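
These edit trees back the trainable edit-tree lemmatizer. For orientation, the component is added under the "trainable_lemmatizer" factory name and needs to be trained (or initialized with labels) before it produces lemmas; a minimal, untrained setup looks roughly like this:

import spacy

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
# train or initialize the pipeline before running it on real text
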
diff --git a/spacy/pipeline/_edit_tree_internals/schemas.py b/spacy/pipeline/_edit_tree_internals/schemas.py
index 1e307b66c..89f2861ce 100644
--- a/spacy/pipeline/_edit_tree_internals/schemas.py
+++ b/spacy/pipeline/_edit_tree_internals/schemas.py
@@ -1,8 +1,12 @@
from collections import defaultdict
from typing import Any, Dict, List, Union
-from pydantic import BaseModel, Field, ValidationError
-from pydantic.types import StrictBool, StrictInt, StrictStr
+try:
+ from pydantic.v1 import BaseModel, Field, ValidationError
+ from pydantic.v1.types import StrictBool, StrictInt, StrictStr
+except ImportError:
+ from pydantic import BaseModel, Field, ValidationError # type: ignore
+ from pydantic.types import StrictBool, StrictInt, StrictStr # type: ignore
class MatchNodeSchema(BaseModel):
diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx
index 04dd3f11e..ac04be5a7 100644
--- a/spacy/pipeline/_parser_internals/_beam_utils.pyx
+++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx
@@ -1,17 +1,13 @@
# cython: infer_types=True
-# cython: profile=True
-cimport numpy as np
-
import numpy
-from cpython.ref cimport Py_XDECREF, PyObject
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.extra.search cimport MaxViolation
-from ...typedefs cimport class_t, hash_t
+from ...typedefs cimport class_t
from .transition_system cimport Transition, TransitionSystem
from ...errors import Errors
@@ -146,7 +142,6 @@ def update_beam(TransitionSystem moves, states, golds, model, int width, beam_de
cdef MaxViolation violn
pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density)
gbeam = BeamBatch(moves, states, golds, width=width, density=0.0)
- cdef StateClass state
beam_maps = []
backprops = []
violns = [MaxViolation() for _ in range(len(states))]
diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd
index 24acc350c..c063cf97c 100644
--- a/spacy/pipeline/_parser_internals/_state.pxd
+++ b/spacy/pipeline/_parser_internals/_state.pxd
@@ -277,7 +277,6 @@ cdef cppclass StateC:
return n
-
int n_L(int head) nogil const:
return n_arcs(this._left_arcs, head)
diff --git a/spacy/pipeline/_parser_internals/_state.pyx b/spacy/pipeline/_parser_internals/_state.pyx
index e69de29bb..61bf62038 100644
--- a/spacy/pipeline/_parser_internals/_state.pyx
+++ b/spacy/pipeline/_parser_internals/_state.pyx
@@ -0,0 +1 @@
+# cython: profile=False
diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx
index 2c9eb0ff5..e13754944 100644
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -1,4 +1,4 @@
-# cython: profile=True, cdivision=True, infer_types=True
+# cython: cdivision=True, infer_types=True
from cymem.cymem cimport Address, Pool
from libc.stdint cimport int32_t
from libcpp.vector cimport vector
@@ -9,7 +9,7 @@ from ...strings cimport hash_string
from ...structs cimport TokenC
from ...tokens.doc cimport Doc, set_children_from_heads
from ...tokens.token cimport MISSING_DEP
-from ...typedefs cimport attr_t, hash_t
+from ...typedefs cimport attr_t
from ...training import split_bilu_label
@@ -68,8 +68,9 @@ cdef struct GoldParseStateC:
weight_t pop_cost
-cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
- heads, labels, sent_starts) except *:
+cdef GoldParseStateC create_gold_state(
+ Pool mem, const StateC* state, heads, labels, sent_starts
+) except *:
cdef GoldParseStateC gs
gs.length = len(heads)
gs.stride = 1
@@ -82,7 +83,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
gs.n_kids_in_stack = mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
for i, is_sent_start in enumerate(sent_starts):
- if is_sent_start == True:
+ if is_sent_start is True:
gs.state_bits[i] = set_state_flag(
gs.state_bits[i],
IS_SENT_START,
@@ -210,6 +211,7 @@ cdef class ArcEagerGold:
def update(self, StateClass stcls):
update_gold_state(&self.c, stcls.c)
+
def _get_aligned_sent_starts(example):
"""Get list of SENT_START attributes aligned to the predicted tokenization.
    If the reference has no sentence starts, return a list of None values.
@@ -524,7 +526,6 @@ cdef class Break:
"""
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
- cdef int i
if st.buffer_length() < 2:
return False
elif st.B(1) != st.B(0) + 1:
@@ -556,8 +557,8 @@ cdef class Break:
cost -= 1
if gold.heads[si] == b0:
cost -= 1
- if not is_sent_start(gold, state.B(1)) \
- and not is_sent_start_unknown(gold, state.B(1)):
+ if not is_sent_start(gold, state.B(1)) and\
+ not is_sent_start_unknown(gold, state.B(1)):
cost += 1
return cost
@@ -803,7 +804,6 @@ cdef class ArcEager(TransitionSystem):
raise TypeError(Errors.E909.format(name="ArcEagerGold"))
cdef ArcEagerGold gold_ = gold
gold_state = gold_.c
- n_gold = 0
if self.c[i].is_valid(stcls.c, self.c[i].label):
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
else:
@@ -875,7 +875,7 @@ cdef class ArcEager(TransitionSystem):
print("Gold")
for token in example.y:
print(token.i, token.text, token.dep_, token.head.text)
- aligned_heads, aligned_labels = example.get_aligned_parse()
+ aligned_heads, _aligned_labels = example.get_aligned_parse()
print("Aligned heads")
for i, head in enumerate(aligned_heads):
print(example.x[i], example.x[head] if head is not None else "__")
diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index e1edb4464..e4312bd2f 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,6 +1,4 @@
-import os
-import random
-
+# cython: profile=False
from cymem.cymem cimport Pool
from libc.stdint cimport int32_t
@@ -14,7 +12,7 @@ from ...tokens.span import Span
from ...attrs cimport IS_SPACE
from ...lexeme cimport Lexeme
-from ...structs cimport SpanC, TokenC
+from ...structs cimport SpanC
from ...tokens.span cimport Span
from ...typedefs cimport attr_t, weight_t
@@ -141,11 +139,10 @@ cdef class BiluoPushDown(TransitionSystem):
OUT: Counter()
}
actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity
- actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
+ actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
for entity_type in kwargs.get('entity_types', []):
for action in (BEGIN, IN, LAST, UNIT):
actions[action][entity_type] = 1
- moves = ('M', 'B', 'I', 'L', 'U')
for example in kwargs.get('examples', []):
for token in example.y:
ent_type = token.ent_type_
@@ -164,7 +161,7 @@ cdef class BiluoPushDown(TransitionSystem):
if token.ent_type:
labels.add(token.ent_type_)
return labels
-
+
def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'
@@ -325,7 +322,6 @@ cdef class BiluoPushDown(TransitionSystem):
raise TypeError(Errors.E909.format(name="BiluoGold"))
cdef BiluoGold gold_ = gold
gold_state = gold_.c
- n_gold = 0
if self.c[i].is_valid(stcls.c, self.c[i].label):
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
else:
@@ -486,10 +482,8 @@ cdef class In:
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = _gold
- move = IN
cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
cdef int g_act = gold.ner[s.B(0)].move
- cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
if g_act == MISSING:
@@ -549,12 +543,10 @@ cdef class Last:
@staticmethod
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = _gold
- move = LAST
b0 = s.B(0)
ent_start = s.E(0)
cdef int g_act = gold.ner[b0].move
- cdef attr_t g_tag = gold.ner[b0].label
cdef int cost = 0
@@ -650,7 +642,6 @@ cdef class Unit:
cost += 1
break
return cost
-
cdef class Out:
@@ -675,7 +666,6 @@ cdef class Out:
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
gold = _gold
cdef int g_act = gold.ner[s.B(0)].move
- cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef weight_t cost = 0
if g_act == MISSING:
pass
diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx
index 66f423b3b..7de19851e 100644
--- a/spacy/pipeline/_parser_internals/nonproj.pyx
+++ b/spacy/pipeline/_parser_internals/nonproj.pyx
@@ -1,4 +1,4 @@
-# cython: profile=True, infer_types=True
+# cython: infer_types=True
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing implementation uses the HEAD decoration
scheme.
@@ -125,14 +125,17 @@ def decompose(label):
def is_decorated(label):
return DELIMITER in label
+
def count_decorated_labels(gold_data):
freqs = {}
for example in gold_data:
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
example.get_aligned("DEP"))
# set the label to ROOT for each root dependent
- deco_deps = ['ROOT' if head == i else deco_deps[i]
- for i, head in enumerate(proj_heads)]
+ deco_deps = [
+ 'ROOT' if head == i else deco_deps[i]
+ for i, head in enumerate(proj_heads)
+ ]
# count label frequencies
for label in deco_deps:
if is_decorated(label):
@@ -160,9 +163,9 @@ def projectivize(heads, labels):
cdef vector[int] _heads_to_c(heads):
- cdef vector[int] c_heads;
+ cdef vector[int] c_heads
for head in heads:
- if head == None:
+ if head is None:
c_heads.push_back(-1)
else:
assert head < len(heads)
@@ -199,6 +202,7 @@ def _decorate(heads, proj_heads, labels):
deco_labels.append(labels[tokenid])
return deco_labels
+
def get_smallest_nonproj_arc_slow(heads):
cdef vector[int] c_heads = _heads_to_c(heads)
return _get_smallest_nonproj_arc(c_heads)
diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx
index 0a2657af1..e3b063b7d 100644
--- a/spacy/pipeline/_parser_internals/stateclass.pyx
+++ b/spacy/pipeline/_parser_internals/stateclass.pyx
@@ -1,6 +1,5 @@
# cython: infer_types=True
-import numpy
-
+# cython: profile=False
from libcpp.vector cimport vector
from ...tokens.doc cimport Doc
@@ -38,11 +37,11 @@ cdef class StateClass:
cdef vector[ArcC] arcs
self.c.get_arcs(&arcs)
return list(arcs)
- #py_arcs = []
- #for arc in arcs:
- # if arc.head != -1 and arc.child != -1:
- # py_arcs.append((arc.head, arc.child, arc.label))
- #return arcs
+ # py_arcs = []
+ # for arc in arcs:
+ # if arc.head != -1 and arc.child != -1:
+ # py_arcs.append((arc.head, arc.child, arc.label))
+ # return arcs
def add_arc(self, int head, int child, int label):
self.c.add_arc(head, child, label)
@@ -52,10 +51,10 @@ cdef class StateClass:
def H(self, int child):
return self.c.H(child)
-
+
def L(self, int head, int idx):
return self.c.L(head, idx)
-
+
def R(self, int head, int idx):
return self.c.R(head, idx)
@@ -98,7 +97,7 @@ cdef class StateClass:
def H(self, int i):
return self.c.H(i)
-
+
def E(self, int i):
return self.c.E(i)
@@ -116,7 +115,7 @@ cdef class StateClass:
def H_(self, int i):
return self.doc[self.c.H(i)]
-
+
def E_(self, int i):
return self.doc[self.c.E(i)]
@@ -125,7 +124,7 @@ cdef class StateClass:
def R_(self, int i, int idx):
return self.doc[self.c.R(i, idx)]
-
+
def empty(self):
return self.c.empty()
@@ -134,7 +133,7 @@ cdef class StateClass:
def at_break(self):
return False
- #return self.c.at_break()
+ # return self.c.at_break()
def has_head(self, int i):
return self.c.has_head(i)
diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd
index ce17480d4..04cd10d88 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pxd
+++ b/spacy/pipeline/_parser_internals/transition_system.pxd
@@ -20,11 +20,15 @@ cdef struct Transition:
int (*do)(StateC* state, attr_t label) nogil
-ctypedef weight_t (*get_cost_func_t)(const StateC* state, const void* gold,
- attr_tlabel) nogil
-ctypedef weight_t (*move_cost_func_t)(const StateC* state, const void* gold) nogil
-ctypedef weight_t (*label_cost_func_t)(const StateC* state, const void*
- gold, attr_t label) nogil
+ctypedef weight_t (*get_cost_func_t)(
+    const StateC* state, const void* gold, attr_t label
+) nogil
+ctypedef weight_t (*move_cost_func_t)(
+ const StateC* state, const void* gold
+) nogil
+ctypedef weight_t (*label_cost_func_t)(
+ const StateC* state, const void* gold, attr_t label
+) nogil
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx
index 053c87f22..e035053b3 100644
--- a/spacy/pipeline/_parser_internals/transition_system.pyx
+++ b/spacy/pipeline/_parser_internals/transition_system.pyx
@@ -1,4 +1,5 @@
# cython: infer_types=True
+# cython: profile=False
from __future__ import print_function
from cymem.cymem cimport Pool
@@ -8,9 +9,7 @@ from collections import Counter
import srsly
from ...structs cimport TokenC
-from ...tokens.doc cimport Doc
from ...typedefs cimport attr_t, weight_t
-from . cimport _beam_utils
from .stateclass cimport StateClass
from ... import util
@@ -231,7 +230,6 @@ cdef class TransitionSystem:
return self
def to_bytes(self, exclude=tuple()):
- transitions = []
serializers = {
'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes(),
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index cb896c385..18a220bd6 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -1,6 +1,6 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
from collections import defaultdict
-from typing import Callable, Iterable, Optional
+from typing import Callable, Optional
from thinc.api import Config, Model
@@ -124,6 +124,7 @@ def make_parser(
scorer=scorer,
)
+
@Language.factory(
"beam_parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 4ca0ce165..d415ae43c 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -1,8 +1,7 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
from itertools import islice
from typing import Callable, Dict, Optional, Union
-import srsly
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
from ..morphology cimport Morphology
@@ -14,10 +13,8 @@ from ..errors import Errors
from ..language import Language
from ..parts_of_speech import IDS as POS_IDS
from ..scorer import Scorer
-from ..symbols import POS
from ..training import validate_examples, validate_get_examples
from ..util import registry
-from .pipe import deserialize_config
from .tagger import Tagger
# See #9050
@@ -76,8 +73,11 @@ def morphologizer_score(examples, **kwargs):
results = {}
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
- results.update(Scorer.score_token_attr_per_feat(examples,
- "morph", getter=morph_key_getter, **kwargs))
+ results.update(
+ Scorer.score_token_attr_per_feat(
+ examples, "morph", getter=morph_key_getter, **kwargs
+ )
+ )
return results
@@ -233,7 +233,6 @@ class Morphologizer(Tagger):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
- cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"]
cdef bint extend = self.cfg["extend"]
labels = self.labels
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index 6b62c0811..f33a90fde 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -1,16 +1,13 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
from typing import Optional
import numpy
from thinc.api import Config, CosineDistance, Model, set_dropout_rate, to_categorical
-from ..tokens.doc cimport Doc
-
-from ..attrs import ID, POS
+from ..attrs import ID
from ..errors import Errors
from ..language import Language
from ..training import validate_examples
-from ._parser_internals import nonproj
from .tagger import Tagger
from .trainable_pipe import TrainablePipe
@@ -103,10 +100,9 @@ class MultitaskObjective(Tagger):
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
- docs = [eg.predicted for eg in examples]
for i, eg in enumerate(examples):
# Handles alignment for tokenization differences
- doc_annots = eg.get_aligned() # TODO
+ _doc_annots = eg.get_aligned() # TODO
for j in range(len(eg.predicted)):
tok_annots = {key: values[j] for key, values in tok_annots.items()}
label = self.make_label(j, tok_annots)
@@ -206,7 +202,6 @@ class ClozeMultitask(TrainablePipe):
losses[self.name] = 0.
set_dropout_rate(self.model, drop)
validate_examples(examples, "ClozeMultitask.rehearse")
- docs = [eg.predicted for eg in examples]
predictions, bp_predictions = self.model.begin_update()
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
bp_predictions(d_predictions)
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 8dd6c3c43..bb009dc7a 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -1,6 +1,6 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
from collections import defaultdict
-from typing import Callable, Iterable, Optional
+from typing import Callable, Optional
from thinc.api import Config, Model
@@ -10,7 +10,7 @@ from ._parser_internals.ner cimport BiluoPushDown
from .transition_parser cimport Parser
from ..language import Language
-from ..scorer import PRFScore, get_ner_prf
+from ..scorer import get_ner_prf
from ..training import remove_bilu_prefix
from ..util import registry
@@ -100,6 +100,7 @@ def make_ner(
scorer=scorer,
)
+
@Language.factory(
"beam_ner",
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 42f518882..72ea7e45a 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -1,6 +1,6 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
import warnings
-from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
+from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
import srsly
@@ -40,7 +40,7 @@ cdef class Pipe:
"""
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name))
- def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
+ def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
@@ -59,7 +59,7 @@ cdef class Pipe:
except Exception as e:
error_handler(self.name, self, [doc], e)
- def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
+ def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None):
"""Initialize the pipe. For non-trainable components, this method
is optional. For trainable components, which should inherit
from the subclass TrainablePipe, the provided data examples
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index 2fe7e1540..08ba9d989 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
from typing import Callable, List, Optional
import srsly
@@ -7,13 +7,13 @@ from ..tokens.doc cimport Doc
from .. import util
from ..language import Language
-from ..scorer import Scorer
from .pipe import Pipe
from .senter import senter_score
# see #9050
BACKWARD_OVERWRITE = False
+
@Language.factory(
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
@@ -36,17 +36,19 @@ class Sentencizer(Pipe):
DOCS: https://spacy.io/api/sentencizer
"""
- default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
- '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
- '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
- '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
- '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
- '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
- '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
- '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
- '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
- '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
- '。', '。']
+ default_punct_chars = [
+ '!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
+ '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
+ '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
+ '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
+ '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
+ '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
+ '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
+ '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
+ '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
+ '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
+ '。', '。'
+ ]
def __init__(
self,
@@ -128,7 +130,6 @@ class Sentencizer(Pipe):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
- cdef int idx = 0
for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i]
for j, tag_id in enumerate(doc_tag_ids):
@@ -169,7 +170,6 @@ class Sentencizer(Pipe):
path = path.with_suffix(".json")
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
-
def from_disk(self, path, *, exclude=tuple()):
"""Load the sentencizer from disk.
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 26f98ba59..df093baa9 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,8 +1,7 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
from itertools import islice
from typing import Callable, Optional
-import srsly
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
from ..tokens.doc cimport Doc
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 47aae2bb7..34e85d49c 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,26 +1,18 @@
-# cython: infer_types=True, profile=True, binding=True
-import warnings
+# cython: infer_types=True, binding=True
from itertools import islice
from typing import Callable, Optional
import numpy
-import srsly
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
-from thinc.types import Floats2d
-from ..morphology cimport Morphology
from ..tokens.doc cimport Doc
-from ..vocab cimport Vocab
from .. import util
-from ..attrs import ID, POS
-from ..errors import Errors, Warnings
+from ..errors import Errors
from ..language import Language
-from ..parts_of_speech import X
from ..scorer import Scorer
from ..training import validate_examples, validate_get_examples
from ..util import registry
-from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe
# See #9050
@@ -169,7 +161,6 @@ class Tagger(TrainablePipe):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
- cdef Vocab vocab = self.vocab
cdef bint overwrite = self.cfg["overwrite"]
labels = self.labels
for i, doc in enumerate(docs):
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index 7aa91ac16..8f219b327 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, profile=True, binding=True
+# cython: infer_types=True, binding=True
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple
import srsly
@@ -55,7 +55,7 @@ cdef class TrainablePipe(Pipe):
except Exception as e:
error_handler(self.name, self, [doc], e)
- def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
+ def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
@@ -102,9 +102,9 @@ cdef class TrainablePipe(Pipe):
def update(self,
examples: Iterable["Example"],
*,
- drop: float=0.0,
- sgd: Optimizer=None,
- losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+ drop: float = 0.0,
+ sgd: Optimizer = None,
+ losses: Optional[Dict[str, float]] = None) -> Dict[str, float]:
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
@@ -138,8 +138,8 @@ cdef class TrainablePipe(Pipe):
def rehearse(self,
examples: Iterable[Example],
*,
- sgd: Optimizer=None,
- losses: Dict[str, float]=None,
+ sgd: Optimizer = None,
+ losses: Dict[str, float] = None,
**config) -> Dict[str, float]:
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
teach the current model to make predictions similar to an initial model,
@@ -177,7 +177,7 @@ cdef class TrainablePipe(Pipe):
"""
return util.create_default_optimizer()
- def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
+ def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None):
"""Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each TrainablePipe component,
ensuring the internal model (if available) is initialized properly
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index e5e88d521..7ddb91e01 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -13,8 +13,18 @@ cdef class Parser(TrainablePipe):
cdef readonly TransitionSystem moves
cdef public object _multitasks
- cdef void _parseC(self, CBlas cblas, StateC** states,
- WeightsC weights, SizesC sizes) nogil
+ cdef void _parseC(
+ self,
+ CBlas cblas,
+ StateC** states,
+ WeightsC weights,
+ SizesC sizes
+ ) nogil
- cdef void c_transition_batch(self, StateC** states, const float* scores,
- int nr_class, int batch_size) nogil
+ cdef void c_transition_batch(
+ self,
+ StateC** states,
+ const float* scores,
+ int nr_class,
+ int batch_size
+ ) nogil
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index ef4d9b362..9a278fc13 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -1,4 +1,5 @@
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
+# cython: profile=False
from __future__ import print_function
cimport numpy as np
@@ -7,20 +8,15 @@ from cymem.cymem cimport Pool
from itertools import islice
from libc.stdlib cimport calloc, free
-from libc.string cimport memcpy, memset
+from libc.string cimport memset
from libcpp.vector cimport vector
import random
-import srsly
-from thinc.api import CupyOps, NumpyOps, get_ops, set_dropout_rate
-
-from thinc.extra.search cimport Beam
-
-import warnings
-
import numpy
import numpy.random
+import srsly
+from thinc.api import CupyOps, NumpyOps, set_dropout_rate
from ..ml.parser_model cimport (
ActivationsC,
@@ -42,7 +38,7 @@ from .trainable_pipe import TrainablePipe
from ._parser_internals cimport _beam_utils
from .. import util
-from ..errors import Errors, Warnings
+from ..errors import Errors
from ..training import validate_examples, validate_get_examples
from ._parser_internals import _beam_utils
@@ -258,7 +254,6 @@ cdef class Parser(TrainablePipe):
except Exception as e:
error_handler(self.name, self, batch_in_order, e)
-
def predict(self, docs):
if isinstance(docs, Doc):
docs = [docs]
@@ -300,8 +295,6 @@ cdef class Parser(TrainablePipe):
return batch
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
- cdef Beam beam
- cdef Doc doc
self._ensure_labels_are_added(docs)
batch = _beam_utils.BeamBatch(
self.moves,
@@ -321,16 +314,18 @@ cdef class Parser(TrainablePipe):
del model
return list(batch)
- cdef void _parseC(self, CBlas cblas, StateC** states,
- WeightsC weights, SizesC sizes) nogil:
- cdef int i, j
+ cdef void _parseC(
+ self, CBlas cblas, StateC** states, WeightsC weights, SizesC sizes
+ ) nogil:
+ cdef int i
cdef vector[StateC*] unfinished
cdef ActivationsC activations = alloc_activations(sizes)
while sizes.states >= 1:
predict_states(cblas, &activations, states, &weights, sizes)
# Validate actions, argmax, take action.
- self.c_transition_batch(states,
- activations.scores, sizes.classes, sizes.states)
+ self.c_transition_batch(
+ states, activations.scores, sizes.classes, sizes.states
+ )
for i in range(sizes.states):
if not states[i].is_final():
unfinished.push_back(states[i])
@@ -342,7 +337,6 @@ cdef class Parser(TrainablePipe):
def set_annotations(self, docs, states_or_beams):
cdef StateClass state
- cdef Beam beam
cdef Doc doc
states = _beam_utils.collect_states(states_or_beams, docs)
for i, (state, doc) in enumerate(zip(states, docs)):
@@ -359,8 +353,13 @@ cdef class Parser(TrainablePipe):
self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
return [state for state in states if not state.c.is_final()]
- cdef void c_transition_batch(self, StateC** states, const float* scores,
- int nr_class, int batch_size) nogil:
+ cdef void c_transition_batch(
+ self,
+ StateC** states,
+ const float* scores,
+ int nr_class,
+ int batch_size
+ ) nogil:
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
with gil:
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
@@ -380,7 +379,6 @@ cdef class Parser(TrainablePipe):
free(is_valid)
def update(self, examples, *, drop=0., sgd=None, losses=None):
- cdef StateClass state
if losses is None:
losses = {}
losses.setdefault(self.name, 0.)
@@ -419,8 +417,7 @@ cdef class Parser(TrainablePipe):
if not states:
return losses
model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
-
- all_states = list(states)
+
states_golds = list(zip(states, golds))
n_moves = 0
while states_golds:
@@ -500,8 +497,16 @@ cdef class Parser(TrainablePipe):
del tutor
return losses
- def update_beam(self, examples, *, beam_width,
- drop=0., sgd=None, losses=None, beam_density=0.0):
+ def update_beam(
+ self,
+ examples,
+ *,
+ beam_width,
+ drop=0.,
+ sgd=None,
+ losses=None,
+ beam_density=0.0
+ ):
states, golds, _ = self.moves.init_gold_batch(examples)
if not states:
return losses
@@ -531,8 +536,9 @@ cdef class Parser(TrainablePipe):
is_valid = mem.alloc(self.moves.n_moves, sizeof(int))
costs = mem.alloc(self.moves.n_moves, sizeof(float))
- cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
- dtype='f', order='C')
+ cdef np.ndarray d_scores = numpy.zeros(
+ (len(states), self.moves.n_moves), dtype='f', order='C'
+ )
c_d_scores = d_scores.data
unseen_classes = self.model.attrs["unseen_classes"]
for i, (state, gold) in enumerate(zip(states, golds)):
@@ -542,8 +548,9 @@ cdef class Parser(TrainablePipe):
for j in range(self.moves.n_moves):
if costs[j] <= 0.0 and j in unseen_classes:
unseen_classes.remove(j)
- cpu_log_loss(c_d_scores,
- costs, is_valid, &scores[i, 0], d_scores.shape[1])
+ cpu_log_loss(
+ c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]
+ )
c_d_scores += d_scores.shape[1]
# Note that we don't normalize this. See comment in update() for why.
if losses is not None:
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 22f45372c..fa987b90f 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -16,19 +16,34 @@ from typing import (
Union,
)
-from pydantic import (
- BaseModel,
- ConstrainedStr,
- Field,
- StrictBool,
- StrictFloat,
- StrictInt,
- StrictStr,
- ValidationError,
- create_model,
- validator,
-)
-from pydantic.main import ModelMetaclass
+try:
+ from pydantic.v1 import (
+ BaseModel,
+ ConstrainedStr,
+ Field,
+ StrictBool,
+ StrictFloat,
+ StrictInt,
+ StrictStr,
+ ValidationError,
+ create_model,
+ validator,
+ )
+ from pydantic.v1.main import ModelMetaclass
+except ImportError:
+ from pydantic import ( # type: ignore
+ BaseModel,
+ ConstrainedStr,
+ Field,
+ StrictBool,
+ StrictFloat,
+ StrictInt,
+ StrictStr,
+ ValidationError,
+ create_model,
+ validator,
+ )
+ from pydantic.main import ModelMetaclass # type: ignore
from thinc.api import ConfigValidationError, Model, Optimizer
from thinc.config import Promise
@@ -397,6 +412,7 @@ class ConfigSchemaNlp(BaseModel):
after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
batch_size: Optional[int] = Field(..., title="Default batch size")
+ vectors: Callable = Field(..., title="Vectors implementation")
# fmt: on
class Config:
@@ -465,66 +481,6 @@ CONFIG_SCHEMAS = {
"initialize": ConfigSchemaInit,
}
-
-# Project config Schema
-
-
-class ProjectConfigAssetGitItem(BaseModel):
- # fmt: off
- repo: StrictStr = Field(..., title="URL of Git repo to download from")
- path: StrictStr = Field(..., title="File path or sub-directory to download (used for sparse checkout)")
- branch: StrictStr = Field("master", title="Branch to clone from")
- # fmt: on
-
-
-class ProjectConfigAssetURL(BaseModel):
- # fmt: off
- dest: StrictStr = Field(..., title="Destination of downloaded asset")
- url: Optional[StrictStr] = Field(None, title="URL of asset")
- checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
- description: StrictStr = Field("", title="Description of asset")
- # fmt: on
-
-
-class ProjectConfigAssetGit(BaseModel):
- # fmt: off
- git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
- checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
- description: Optional[StrictStr] = Field(None, title="Description of asset")
- # fmt: on
-
-
-class ProjectConfigCommand(BaseModel):
- # fmt: off
- name: StrictStr = Field(..., title="Name of command")
- help: Optional[StrictStr] = Field(None, title="Command description")
- script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
- deps: List[StrictStr] = Field([], title="File dependencies required by this command")
- outputs: List[StrictStr] = Field([], title="Outputs produced by this command")
- outputs_no_cache: List[StrictStr] = Field([], title="Outputs not tracked by DVC (DVC only)")
- no_skip: bool = Field(False, title="Never skip this command, even if nothing changed")
- # fmt: on
-
- class Config:
- title = "A single named command specified in a project config"
- extra = "forbid"
-
-
-class ProjectConfigSchema(BaseModel):
- # fmt: off
- vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands")
- env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names")
- assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets")
- workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
- commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
- title: Optional[str] = Field(None, title="Project title")
- spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
- # fmt: on
-
- class Config:
- title = "Schema for project configuration file"
-
-
# Recommendations for init config workflows
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 16c3e2b5b..376a13175 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,8 +1,8 @@
# cython: infer_types=True
+# cython: profile=False
cimport cython
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
-from libcpp.set cimport set
from murmurhash.mrmr cimport hash32, hash64
import srsly
@@ -20,9 +20,10 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
try:
out_hash[0] = key
return True
- except:
+ except: # no-cython-lint
return False
+
def get_string_id(key):
"""Get a string ID, handling the reserved symbols correctly. If the key is
already an ID, return it.
@@ -87,7 +88,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
cdef int n_length_bytes
cdef int i
cdef Utf8Str* string = mem.alloc(1, sizeof(Utf8Str))
- cdef uint32_t ulength = length
if length < sizeof(string.s):
string.s[0] = length
memcpy(&string.s[1], chars, length)
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 9efb068fd..8cfcc2964 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -52,7 +52,7 @@ cdef struct TokenC:
int sent_start
int ent_iob
- attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
+ attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
attr_t ent_kb_id
hash_t ent_id
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index bc15d9b80..73be19145 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -92,7 +92,7 @@ cdef enum symbol_t:
ADV
AUX
CONJ
- CCONJ # U20
+ CCONJ # U20
DET
INTJ
NOUN
@@ -418,7 +418,7 @@ cdef enum symbol_t:
ccomp
complm
conj
- cop # U20
+ cop # U20
csubj
csubjpass
dep
@@ -441,8 +441,8 @@ cdef enum symbol_t:
num
number
oprd
- obj # U20
- obl # U20
+ obj # U20
+ obl # U20
parataxis
partmod
pcomp
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx
index b0345c710..f7713577b 100644
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@@ -1,4 +1,5 @@
# cython: optimize.unpack_method_calls=False
+# cython: profile=False
IDS = {
"": NIL,
"IS_ALPHA": IS_ALPHA,
@@ -96,7 +97,7 @@ IDS = {
"ADV": ADV,
"AUX": AUX,
"CONJ": CONJ,
- "CCONJ": CCONJ, # U20
+ "CCONJ": CCONJ, # U20
"DET": DET,
"INTJ": INTJ,
"NOUN": NOUN,
@@ -421,7 +422,7 @@ IDS = {
"ccomp": ccomp,
"complm": complm,
"conj": conj,
- "cop": cop, # U20
+ "cop": cop, # U20
"csubj": csubj,
"csubjpass": csubjpass,
"dep": dep,
@@ -444,8 +445,8 @@ IDS = {
"num": num,
"number": number,
"oprd": oprd,
- "obj": obj, # U20
- "obl": obl, # U20
+ "obj": obj, # U20
+ "obl": obl, # U20
"parataxis": parataxis,
"partmod": partmod,
"pcomp": pcomp,
diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py
index 44b3bb26b..be33f90cf 100644
--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@@ -216,6 +216,11 @@ def test_dependency_matcher_pattern_validation(en_vocab):
pattern2 = copy.deepcopy(pattern)
pattern2[1]["RIGHT_ID"] = "fox"
matcher.add("FOUNDED", [pattern2])
+ # invalid key
+ with pytest.warns(UserWarning):
+ pattern2 = copy.deepcopy(pattern)
+ pattern2[1]["FOO"] = "BAR"
+ matcher.add("FOUNDED", [pattern2])
def test_dependency_matcher_callback(en_vocab, doc):
diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py
index 21fa36865..45f9f4ee7 100644
--- a/spacy/tests/matcher/test_pattern_validation.py
+++ b/spacy/tests/matcher/test_pattern_validation.py
@@ -52,7 +52,8 @@ TEST_PATTERNS = [
@pytest.mark.parametrize(
- "pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]]
+ "pattern",
+ [[{"XX": "y"}], [{"LENGTH": "2"}], [{"TEXT": {"IN": 5}}], [{"text": {"in": 6}}]],
)
def test_matcher_pattern_validation(en_vocab, pattern):
matcher = Matcher(en_vocab, validate=True)
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 9e83d5fb1..ff07c5b45 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -4,14 +4,15 @@ from pathlib import Path
def test_build_dependencies():
# Check that library requirements are pinned exactly the same across different setup files.
- # TODO: correct checks for numpy rather than ignoring
libs_ignore_requirements = [
+ "numpy",
"pytest",
"pytest-timeout",
"mock",
"flake8",
"hypothesis",
"pre-commit",
+ "cython-lint",
"black",
"isort",
"mypy",
@@ -22,6 +23,7 @@ def test_build_dependencies():
]
# ignore language-specific packages that shouldn't be installed by all
libs_ignore_setup = [
+ "numpy",
"fugashi",
"natto-py",
"pythainlp",
diff --git a/spacy/tests/pipeline/test_initialize.py b/spacy/tests/pipeline/test_initialize.py
index 6dd4114f1..9854b391e 100644
--- a/spacy/tests/pipeline/test_initialize.py
+++ b/spacy/tests/pipeline/test_initialize.py
@@ -1,5 +1,10 @@
import pytest
-from pydantic import StrictBool
+
+try:
+ from pydantic.v1 import StrictBool
+except ImportError:
+ from pydantic import StrictBool # type: ignore
+
from thinc.api import ConfigValidationError
from spacy.lang.en import English
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index 0f1454b55..83b986784 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -1,5 +1,10 @@
import pytest
-from pydantic import StrictInt, StrictStr
+
+try:
+ from pydantic.v1 import StrictInt, StrictStr
+except ImportError:
+ from pydantic import StrictInt, StrictStr # type: ignore
+
from thinc.api import ConfigValidationError, Linear, Model
import spacy
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 8e1c9ca32..0d2fe0a9e 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -1,31 +1,20 @@
import math
import os
-import time
from collections import Counter
from pathlib import Path
from typing import Any, Dict, List, Tuple
-import numpy
import pytest
import srsly
from click import NoSuchOption
from packaging.specifiers import SpecifierSet
-from thinc.api import Config, ConfigValidationError
+from thinc.api import Config
import spacy
from spacy import about
+from spacy import info as spacy_info
from spacy.cli import info
-from spacy.cli._util import (
- download_file,
- is_subpath_of,
- load_project_config,
- parse_config_overrides,
- string_to_list,
- substitute_project_variables,
- upload_file,
- validate_project_commands,
- walk_directory,
-)
+from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory
from spacy.cli.apply import apply
from spacy.cli.debug_data import (
_compile_gold,
@@ -43,13 +32,11 @@ from spacy.cli.find_threshold import find_threshold
from spacy.cli.init_config import RECOMMENDATIONS, fill_config, init_config
from spacy.cli.init_pipeline import _init_labels
from spacy.cli.package import _is_permitted_package_name, get_third_party_dependencies
-from spacy.cli.project.remote_storage import RemoteStorage
-from spacy.cli.project.run import _check_requirements
from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import Language
-from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
+from spacy.schemas import RecommendationSchema
from spacy.tokens import Doc, DocBin
from spacy.tokens.span import Span
from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
@@ -134,25 +121,6 @@ def test_issue7055():
assert "model" in filled_cfg["components"]["ner"]
-@pytest.mark.issue(11235)
-def test_issue11235():
- """
- Test that the cli handles interpolation in the directory names correctly when loading project config.
- """
- lang_var = "en"
- variables = {"lang": lang_var}
- commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
- directories = ["cfg", "${vars.lang}_model"]
- project = {"commands": commands, "vars": variables, "directories": directories}
- with make_tempdir() as d:
- srsly.write_yaml(d / "project.yml", project)
- cfg = load_project_config(d)
- # Check that the directories are interpolated and created correctly
- assert os.path.exists(d / "cfg")
- assert os.path.exists(d / f"{lang_var}_model")
- assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
-
-
@pytest.mark.issue(12566)
@pytest.mark.parametrize(
"factory,output_file",
@@ -225,6 +193,9 @@ def test_cli_info():
raw_data = info(tmp_dir, exclude=[""])
assert raw_data["lang"] == "nl"
assert raw_data["components"] == ["textcat"]
+ raw_data = spacy_info(tmp_dir, exclude=[""])
+ assert raw_data["lang"] == "nl"
+ assert raw_data["components"] == ["textcat"]
def test_cli_converters_conllu_to_docs():
@@ -443,136 +414,6 @@ def test_cli_converters_conll_ner_to_docs():
assert ent.text in ["New York City", "London"]
-def test_project_config_validation_full():
- config = {
- "vars": {"some_var": 20},
- "directories": ["assets", "configs", "corpus", "scripts", "training"],
- "assets": [
- {
- "dest": "x",
- "extra": True,
- "url": "https://example.com",
- "checksum": "63373dd656daa1fd3043ce166a59474c",
- },
- {
- "dest": "y",
- "git": {
- "repo": "https://github.com/example/repo",
- "branch": "develop",
- "path": "y",
- },
- },
- {
- "dest": "z",
- "extra": False,
- "url": "https://example.com",
- "checksum": "63373dd656daa1fd3043ce166a59474c",
- },
- ],
- "commands": [
- {
- "name": "train",
- "help": "Train a model",
- "script": ["python -m spacy train config.cfg -o training"],
- "deps": ["config.cfg", "corpus/training.spcy"],
- "outputs": ["training/model-best"],
- },
- {"name": "test", "script": ["pytest", "custom.py"], "no_skip": True},
- ],
- "workflows": {"all": ["train", "test"], "train": ["train"]},
- }
- errors = validate(ProjectConfigSchema, config)
- assert not errors
-
-
-@pytest.mark.parametrize(
- "config",
- [
- {"commands": [{"name": "a"}, {"name": "a"}]},
- {"commands": [{"name": "a"}], "workflows": {"a": []}},
- {"commands": [{"name": "a"}], "workflows": {"b": ["c"]}},
- ],
-)
-def test_project_config_validation1(config):
- with pytest.raises(SystemExit):
- validate_project_commands(config)
-
-
-@pytest.mark.parametrize(
- "config,n_errors",
- [
- ({"commands": {"a": []}}, 1),
- ({"commands": [{"help": "..."}]}, 1),
- ({"commands": [{"name": "a", "extra": "b"}]}, 1),
- ({"commands": [{"extra": "b"}]}, 2),
- ({"commands": [{"name": "a", "deps": [123]}]}, 1),
- ],
-)
-def test_project_config_validation2(config, n_errors):
- errors = validate(ProjectConfigSchema, config)
- assert len(errors) == n_errors
-
-
-@pytest.mark.parametrize(
- "int_value",
- [10, pytest.param("10", marks=pytest.mark.xfail)],
-)
-def test_project_config_interpolation(int_value):
- variables = {"a": int_value, "b": {"c": "foo", "d": True}}
- commands = [
- {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]},
- {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]},
- ]
- project = {"commands": commands, "vars": variables}
- with make_tempdir() as d:
- srsly.write_yaml(d / "project.yml", project)
- cfg = load_project_config(d)
- assert type(cfg) == dict
- assert type(cfg["commands"]) == list
- assert cfg["commands"][0]["script"][0] == "hello 10 foo"
- assert cfg["commands"][1]["script"][0] == "foo true"
- commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}]
- project = {"commands": commands, "vars": variables}
- with pytest.raises(ConfigValidationError):
- substitute_project_variables(project)
-
-
-@pytest.mark.parametrize(
- "greeting",
- [342, "everyone", "tout le monde", pytest.param("42", marks=pytest.mark.xfail)],
-)
-def test_project_config_interpolation_override(greeting):
- variables = {"a": "world"}
- commands = [
- {"name": "x", "script": ["hello ${vars.a}"]},
- ]
- overrides = {"vars.a": greeting}
- project = {"commands": commands, "vars": variables}
- with make_tempdir() as d:
- srsly.write_yaml(d / "project.yml", project)
- cfg = load_project_config(d, overrides=overrides)
- assert type(cfg) == dict
- assert type(cfg["commands"]) == list
- assert cfg["commands"][0]["script"][0] == f"hello {greeting}"
-
-
-def test_project_config_interpolation_env():
- variables = {"a": 10}
- env_var = "SPACY_TEST_FOO"
- env_vars = {"foo": env_var}
- commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}]
- project = {"commands": commands, "vars": variables, "env": env_vars}
- with make_tempdir() as d:
- srsly.write_yaml(d / "project.yml", project)
- cfg = load_project_config(d)
- assert cfg["commands"][0]["script"][0] == "hello 10 "
- os.environ[env_var] = "123"
- with make_tempdir() as d:
- srsly.write_yaml(d / "project.yml", project)
- cfg = load_project_config(d)
- assert cfg["commands"][0]["script"][0] == "hello 10 123"
-
-
@pytest.mark.parametrize(
"args,expected",
[
@@ -782,21 +623,6 @@ def test_get_third_party_dependencies():
get_third_party_dependencies(nlp.config)
-@pytest.mark.parametrize(
- "parent,child,expected",
- [
- ("/tmp", "/tmp", True),
- ("/tmp", "/", False),
- ("/tmp", "/tmp/subdir", True),
- ("/tmp", "/tmpdir", False),
- ("/tmp", "/tmp/subdir/..", True),
- ("/tmp", "/tmp/..", False),
- ],
-)
-def test_is_subpath_of(parent, child, expected):
- assert is_subpath_of(parent, child) == expected
-
-
@pytest.mark.slow
@pytest.mark.parametrize(
"factory_name,pipe_name",
@@ -1042,60 +868,6 @@ def test_applycli_user_data():
assert result[0]._.ext == val
-def test_local_remote_storage():
- with make_tempdir() as d:
- filename = "a.txt"
-
- content_hashes = ("aaaa", "cccc", "bbbb")
- for i, content_hash in enumerate(content_hashes):
- # make sure that each subsequent file has a later timestamp
- if i > 0:
- time.sleep(1)
- content = f"{content_hash} content"
- loc_file = d / "root" / filename
- if not loc_file.parent.exists():
- loc_file.parent.mkdir(parents=True)
- with loc_file.open(mode="w") as file_:
- file_.write(content)
-
- # push first version to remote storage
- remote = RemoteStorage(d / "root", str(d / "remote"))
- remote.push(filename, "aaaa", content_hash)
-
- # retrieve with full hashes
- loc_file.unlink()
- remote.pull(filename, command_hash="aaaa", content_hash=content_hash)
- with loc_file.open(mode="r") as file_:
- assert file_.read() == content
-
- # retrieve with command hash
- loc_file.unlink()
- remote.pull(filename, command_hash="aaaa")
- with loc_file.open(mode="r") as file_:
- assert file_.read() == content
-
- # retrieve with content hash
- loc_file.unlink()
- remote.pull(filename, content_hash=content_hash)
- with loc_file.open(mode="r") as file_:
- assert file_.read() == content
-
- # retrieve with no hashes
- loc_file.unlink()
- remote.pull(filename)
- with loc_file.open(mode="r") as file_:
- assert file_.read() == content
-
-
-def test_local_remote_storage_pull_missing():
- # pulling from a non-existent remote pulls nothing gracefully
- with make_tempdir() as d:
- filename = "a.txt"
- remote = RemoteStorage(d / "root", str(d / "remote"))
- assert remote.pull(filename, command_hash="aaaa") is None
- assert remote.pull(filename) is None
-
-
def test_cli_find_threshold(capsys):
def make_examples(nlp: Language) -> List[Example]:
docs: List[Example] = []
@@ -1206,63 +978,6 @@ def test_cli_find_threshold(capsys):
)
-@pytest.mark.filterwarnings("ignore::DeprecationWarning")
-@pytest.mark.parametrize(
- "reqs,output",
- [
- [
- """
- spacy
-
- # comment
-
- thinc""",
- (False, False),
- ],
- [
- """# comment
- --some-flag
- spacy""",
- (False, False),
- ],
- [
- """# comment
- --some-flag
- spacy; python_version >= '3.6'""",
- (False, False),
- ],
- [
- """# comment
- spacyunknowndoesnotexist12345""",
- (True, False),
- ],
- ],
-)
-def test_project_check_requirements(reqs, output):
- import pkg_resources
-
- # excessive guard against unlikely package name
- try:
- pkg_resources.require("spacyunknowndoesnotexist12345")
- except pkg_resources.DistributionNotFound:
- assert output == _check_requirements([req.strip() for req in reqs.split("\n")])
-
-
-def test_upload_download_local_file():
- with make_tempdir() as d1, make_tempdir() as d2:
- filename = "f.txt"
- content = "content"
- local_file = d1 / filename
- remote_file = d2 / filename
- with local_file.open(mode="w") as file_:
- file_.write(content)
- upload_file(local_file, remote_file)
- local_file.unlink()
- download_file(remote_file, local_file)
- with local_file.open(mode="r") as file_:
- assert file_.read() == content
-
-
def test_walk_directory():
with make_tempdir() as d:
files = [
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 3a426113b..108fbf90d 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -1,4 +1,5 @@
import os
+import sys
from pathlib import Path
import pytest
@@ -6,7 +7,7 @@ import srsly
from typer.testing import CliRunner
from spacy.cli._util import app, get_git_version
-from spacy.tokens import Doc, DocBin
+from spacy.tokens import Doc, DocBin, Span
from .util import make_tempdir, normalize_whitespace
@@ -213,6 +214,9 @@ def test_project_clone(options):
assert (out / "README.md").is_file()
+@pytest.mark.skipif(
+ sys.version_info >= (3, 12), reason="Python 3.12+ not supported for remotes"
+)
def test_project_push_pull(project_dir):
proj = dict(SAMPLE_PROJECT)
remote = "xyz"
@@ -233,3 +237,196 @@ def test_project_push_pull(project_dir):
result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
assert result.exit_code == 0
assert test_file.is_file()
+
+
+def test_find_function_valid():
+ # example of architecture in main code base
+ function = "spacy.TextCatBOW.v2"
+ result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
+ assert f"Found registered function '{function}'" in result.stdout
+ assert "textcat.py" in result.stdout
+
+ result = CliRunner().invoke(app, ["find-function", function])
+ assert f"Found registered function '{function}'" in result.stdout
+ assert "textcat.py" in result.stdout
+
+ # example of architecture in spacy-legacy
+ function = "spacy.TextCatBOW.v1"
+ result = CliRunner().invoke(app, ["find-function", function])
+ assert f"Found registered function '{function}'" in result.stdout
+ assert "spacy_legacy" in result.stdout
+ assert "textcat.py" in result.stdout
+
+
+def test_find_function_invalid():
+ # invalid registry
+ function = "spacy.TextCatBOW.v2"
+ registry = "foobar"
+ result = CliRunner().invoke(
+ app, ["find-function", function, "--registry", registry]
+ )
+ assert f"Unknown function registry: '{registry}'" in result.stdout
+
+ # invalid function
+ function = "spacy.TextCatBOW.v666"
+ result = CliRunner().invoke(app, ["find-function", function])
+ assert f"Couldn't find registered function: '{function}'" in result.stdout
+
+
+example_words_1 = ["I", "like", "cats"]
+example_words_2 = ["I", "like", "dogs"]
+example_lemmas_1 = ["I", "like", "cat"]
+example_lemmas_2 = ["I", "like", "dog"]
+example_tags = ["PRP", "VBP", "NNS"]
+example_morphs = [
+ "Case=Nom|Number=Sing|Person=1|PronType=Prs",
+ "Tense=Pres|VerbForm=Fin",
+ "Number=Plur",
+]
+example_deps = ["nsubj", "ROOT", "dobj"]
+example_pos = ["PRON", "VERB", "NOUN"]
+example_ents = ["O", "O", "I-ANIMAL"]
+example_spans = [(2, 3, "ANIMAL")]
+
+TRAIN_EXAMPLE_1 = dict(
+ words=example_words_1,
+ lemmas=example_lemmas_1,
+ tags=example_tags,
+ morphs=example_morphs,
+ deps=example_deps,
+ heads=[1, 1, 1],
+ pos=example_pos,
+ ents=example_ents,
+ spans=example_spans,
+ cats={"CAT": 1.0, "DOG": 0.0},
+)
+TRAIN_EXAMPLE_2 = dict(
+ words=example_words_2,
+ lemmas=example_lemmas_2,
+ tags=example_tags,
+ morphs=example_morphs,
+ deps=example_deps,
+ heads=[1, 1, 1],
+ pos=example_pos,
+ ents=example_ents,
+ spans=example_spans,
+ cats={"CAT": 0.0, "DOG": 1.0},
+)
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+ "component,examples",
+ [
+ ("tagger", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+ ("morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+ ("trainable_lemmatizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+ ("parser", [TRAIN_EXAMPLE_1] * 30),
+ ("ner", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+ ("spancat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+ ("textcat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
+ ],
+)
+def test_init_config_trainable(component, examples, en_vocab):
+ if component == "textcat":
+ train_docs = []
+ for example in examples:
+ doc = Doc(en_vocab, words=example["words"])
+ doc.cats = example["cats"]
+ train_docs.append(doc)
+ elif component == "spancat":
+ train_docs = []
+ for example in examples:
+ doc = Doc(en_vocab, words=example["words"])
+ doc.spans["sc"] = [
+ Span(doc, start, end, label) for start, end, label in example["spans"]
+ ]
+ train_docs.append(doc)
+ else:
+ train_docs = []
+ for example in examples:
+ # cats, spans are not valid kwargs for instantiating a Doc
+ example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
+ doc = Doc(en_vocab, **example)
+ train_docs.append(doc)
+
+ with make_tempdir() as d_in:
+ train_bin = DocBin(docs=train_docs)
+ train_bin.to_disk(d_in / "train.spacy")
+ dev_bin = DocBin(docs=train_docs)
+ dev_bin.to_disk(d_in / "dev.spacy")
+ init_config_result = CliRunner().invoke(
+ app,
+ [
+ "init",
+ "config",
+ f"{d_in}/config.cfg",
+ "--lang",
+ "en",
+ "--pipeline",
+ component,
+ ],
+ )
+ assert init_config_result.exit_code == 0
+ train_result = CliRunner().invoke(
+ app,
+ [
+ "train",
+ f"{d_in}/config.cfg",
+ "--paths.train",
+ f"{d_in}/train.spacy",
+ "--paths.dev",
+ f"{d_in}/dev.spacy",
+ "--output",
+ f"{d_in}/model",
+ ],
+ )
+ assert train_result.exit_code == 0
+ assert Path(d_in / "model" / "model-last").exists()
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+ "component,examples",
+ [("tagger,parser,morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2] * 15)],
+)
+def test_init_config_trainable_multiple(component, examples, en_vocab):
+ train_docs = []
+ for example in examples:
+ example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
+ doc = Doc(en_vocab, **example)
+ train_docs.append(doc)
+
+ with make_tempdir() as d_in:
+ train_bin = DocBin(docs=train_docs)
+ train_bin.to_disk(d_in / "train.spacy")
+ dev_bin = DocBin(docs=train_docs)
+ dev_bin.to_disk(d_in / "dev.spacy")
+ init_config_result = CliRunner().invoke(
+ app,
+ [
+ "init",
+ "config",
+ f"{d_in}/config.cfg",
+ "--lang",
+ "en",
+ "--pipeline",
+ component,
+ ],
+ )
+ assert init_config_result.exit_code == 0
+ train_result = CliRunner().invoke(
+ app,
+ [
+ "train",
+ f"{d_in}/config.cfg",
+ "--paths.train",
+ f"{d_in}/train.spacy",
+ "--paths.dev",
+ f"{d_in}/dev.spacy",
+ "--output",
+ f"{d_in}/model",
+ ],
+ )
+ assert train_result.exit_code == 0
+ assert Path(d_in / "model" / "model-last").exists()
diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py
index ce103068a..12d903dca 100644
--- a/spacy/tests/test_displacy.py
+++ b/spacy/tests/test_displacy.py
@@ -113,7 +113,7 @@ def test_issue5838():
doc = nlp(sample_text)
doc.ents = [Span(doc, 7, 8, label="test")]
html = displacy.render(doc, style="ent")
-    found = html.count("</br>")
+    found = html.count("<br />")
assert found == 4
@@ -350,6 +350,78 @@ def test_displacy_render_wrapper(en_vocab):
displacy.set_render_wrapper(lambda html: html)
+def test_displacy_render_manual_dep():
+ """Test displacy.render with manual data for dep style"""
+ parsed_dep = {
+ "words": [
+ {"text": "This", "tag": "DT"},
+ {"text": "is", "tag": "VBZ"},
+ {"text": "a", "tag": "DT"},
+ {"text": "sentence", "tag": "NN"},
+ ],
+ "arcs": [
+ {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
+ {"start": 2, "end": 3, "label": "det", "dir": "left"},
+ {"start": 1, "end": 3, "label": "attr", "dir": "right"},
+ ],
+ "title": "Title",
+ }
+ html = displacy.render([parsed_dep], style="dep", manual=True)
+ for word in parsed_dep["words"]:
+ assert word["text"] in html
+ assert word["tag"] in html
+
+
+def test_displacy_render_manual_ent():
+ """Test displacy.render with manual data for ent style"""
+ parsed_ents = [
+ {
+ "text": "But Google is starting from behind.",
+ "ents": [{"start": 4, "end": 10, "label": "ORG"}],
+ },
+ {
+ "text": "But Google is starting from behind.",
+ "ents": [{"start": -100, "end": 100, "label": "COMPANY"}],
+ "title": "Title",
+ },
+ ]
+
+ html = displacy.render(parsed_ents, style="ent", manual=True)
+ for parsed_ent in parsed_ents:
+ assert parsed_ent["ents"][0]["label"] in html
+ if "title" in parsed_ent:
+ assert parsed_ent["title"] in html
+
+
+def test_displacy_render_manual_span():
+ """Test displacy.render with manual data for span style"""
+ parsed_spans = [
+ {
+ "text": "Welcome to the Bank of China.",
+ "spans": [
+ {"start_token": 3, "end_token": 6, "label": "ORG"},
+ {"start_token": 5, "end_token": 6, "label": "GPE"},
+ ],
+ "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
+ },
+ {
+ "text": "Welcome to the Bank of China.",
+ "spans": [
+ {"start_token": 3, "end_token": 6, "label": "ORG"},
+ {"start_token": 5, "end_token": 6, "label": "GPE"},
+ ],
+ "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
+ "title": "Title",
+ },
+ ]
+
+ html = displacy.render(parsed_spans, style="span", manual=True)
+ for parsed_span in parsed_spans:
+ assert parsed_span["spans"][0]["label"] in html
+ if "title" in parsed_span:
+ assert parsed_span["title"] in html
+
+
def test_displacy_options_case():
ents = ["foo", "BAR"]
colors = {"FOO": "red", "bar": "green"}
@@ -377,3 +449,22 @@ def test_displacy_manual_sorted_entities():
html = displacy.render(doc, style="ent", manual=True)
assert html.find("FIRST") < html.find("SECOND")
+
+
+@pytest.mark.issue(12816)
+def test_issue12816(en_vocab) -> None:
+ """Test that displaCy's span visualizer escapes annotated HTML tags correctly."""
+ # Create a doc containing an annotated word and an unannotated HTML tag
+    doc = Doc(en_vocab, words=["test", "<TEST>"])
+ doc.spans["sc"] = [Span(doc, 0, 1, label="test")]
+
+ # Verify that the HTML tag is escaped when unannotated
+ html = displacy.render(doc, style="span")
+    assert "&lt;TEST&gt;" in html
+
+ # Annotate the HTML tag
+ doc.spans["sc"].append(Span(doc, 1, 2, label="test"))
+
+ # Verify that the HTML tag is still escaped
+    assert "&lt;TEST&gt;" in html
+ assert "<TEST>" in html
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 438f458ec..704a40485 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -3,7 +3,12 @@ import os
from pathlib import Path
import pytest
-from pydantic import ValidationError
+
+try:
+ from pydantic.v1 import ValidationError
+except ImportError:
+ from pydantic import ValidationError # type: ignore
+
from thinc.api import (
Config,
ConfigValidationError,
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index f7585b45a..a902ebad9 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -31,24 +31,58 @@ cdef class Tokenizer:
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1
- cdef void _filter_special_spans(self, vector[SpanC] &original,
- vector[SpanC] &filtered, int doc_len) nogil
- cdef object _prepare_special_spans(self, Doc doc,
- vector[SpanC] &filtered)
- cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
- object span_data)
- cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
- int* has_special,
- bint with_special_cases) except -1
- cdef int _tokenize(self, Doc tokens, str span, hash_t key,
- int* has_special, bint with_special_cases) except -1
- cdef str _split_affixes(self, Pool mem, str string,
- vector[LexemeC*] *prefixes,
- vector[LexemeC*] *suffixes, int* has_special,
- bint with_special_cases)
- cdef int _attach_tokens(self, Doc tokens, str string,
- vector[LexemeC*] *prefixes,
- vector[LexemeC*] *suffixes, int* has_special,
- bint with_special_cases) except -1
- cdef int _save_cached(self, const TokenC* tokens, hash_t key,
- int* has_special, int n) except -1
+ cdef void _filter_special_spans(
+ self,
+ vector[SpanC] &original,
+ vector[SpanC] &filtered,
+ int doc_len,
+ ) nogil
+ cdef object _prepare_special_spans(
+ self,
+ Doc doc,
+ vector[SpanC] &filtered,
+ )
+ cdef int _retokenize_special_spans(
+ self,
+ Doc doc,
+ TokenC* tokens,
+ object span_data,
+ )
+ cdef int _try_specials_and_cache(
+ self,
+ hash_t key,
+ Doc tokens,
+ int* has_special,
+ bint with_special_cases,
+ ) except -1
+ cdef int _tokenize(
+ self,
+ Doc tokens,
+ str span,
+ hash_t key,
+ int* has_special,
+ bint with_special_cases,
+ ) except -1
+ cdef str _split_affixes(
+ self,
+ Pool mem,
+ str string,
+ vector[LexemeC*] *prefixes,
+        vector[LexemeC*] *suffixes,
+        int* has_special,
+ bint with_special_cases,
+ )
+ cdef int _attach_tokens(
+ self,
+ Doc tokens,
+ str string,
+ vector[LexemeC*] *prefixes,
+        vector[LexemeC*] *suffixes,
+        int* has_special,
+ bint with_special_cases,
+ ) except -1
+ cdef int _save_cached(
+ self,
+ const TokenC* tokens,
+ hash_t key,
+ int* has_special,
+ int n,
+ ) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 3861b1cee..a239eaf45 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,4 +1,4 @@
-# cython: embedsignature=True, profile=True, binding=True
+# cython: embedsignature=True, binding=True
cimport cython
from cymem.cymem cimport Pool
from cython.operator cimport dereference as deref
@@ -8,20 +8,18 @@ from libcpp.set cimport set as stdset
from preshed.maps cimport PreshMap
import re
-import warnings
-
from .lexeme cimport EMPTY_LEXEME
from .strings cimport hash_string
from .tokens.doc cimport Doc
from . import util
from .attrs import intify_attrs
-from .errors import Errors, Warnings
+from .errors import Errors
from .scorer import Scorer
from .symbols import NORM, ORTH
from .tokens import Span
from .training import validate_examples
-from .util import get_words_and_spaces, registry
+from .util import get_words_and_spaces
cdef class Tokenizer:
@@ -324,7 +322,7 @@ cdef class Tokenizer:
cdef int span_start
cdef int span_end
while i < doc.length:
- if not i in span_data:
+ if i not in span_data:
tokens[i + offset] = doc.c[i]
i += 1
else:
@@ -395,12 +393,15 @@ cdef class Tokenizer:
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
- cdef str _split_affixes(self, Pool mem, str string,
- vector[const LexemeC*] *prefixes,
- vector[const LexemeC*] *suffixes,
- int* has_special,
- bint with_special_cases):
- cdef size_t i
+ cdef str _split_affixes(
+ self,
+ Pool mem,
+ str string,
+ vector[const LexemeC*] *prefixes,
+ vector[const LexemeC*] *suffixes,
+ int* has_special,
+ bint with_special_cases
+ ):
cdef str prefix
cdef str suffix
cdef str minus_pre
@@ -445,10 +446,6 @@ cdef class Tokenizer:
vector[const LexemeC*] *suffixes,
int* has_special,
bint with_special_cases) except -1:
- cdef bint specials_hit = 0
- cdef bint cache_hit = 0
- cdef int split, end
- cdef const LexemeC* const* lexemes
cdef const LexemeC* lexeme
cdef str span
cdef int i
@@ -458,9 +455,11 @@ cdef class Tokenizer:
if string:
if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
pass
- elif (self.token_match and self.token_match(string)) or \
- (self.url_match and \
- self.url_match(string)):
+ elif (
+ (self.token_match and self.token_match(string)) or
+ (self.url_match and self.url_match(string))
+ ):
+
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
@@ -821,7 +820,7 @@ cdef class Tokenizer:
self.infix_finditer = None
self.token_match = None
self.url_match = None
- msg = util.from_bytes(bytes_data, deserializers, exclude)
+ util.from_bytes(bytes_data, deserializers, exclude)
if "prefix_search" in data and isinstance(data["prefix_search"], str):
self.prefix_search = re.compile(data["prefix_search"]).search
if "suffix_search" in data and isinstance(data["suffix_search"], str):
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 8ed707ab9..b0e4ff85c 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -1,7 +1,6 @@
-# cython: infer_types=True, bounds_check=False, profile=True
+# cython: infer_types=True, bounds_check=False
from cymem.cymem cimport Pool
-from libc.stdlib cimport free, malloc
-from libc.string cimport memcpy, memset
+from libc.string cimport memset
import numpy
from thinc.api import get_array_module
@@ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM
from ..lexeme cimport EMPTY_LEXEME, Lexeme
from ..structs cimport LexemeC, TokenC
from ..vocab cimport Vocab
-from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
+from .doc cimport Doc, set_children_from_heads, token_by_start
from .span cimport Span
from .token cimport Token
@@ -147,7 +146,7 @@ def _merge(Doc doc, merges):
syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
- cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index
+ cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index
cdef Span span
cdef const LexemeC* lex
cdef TokenC* token
@@ -165,7 +164,6 @@ def _merge(Doc doc, merges):
merges.sort(key=_get_start)
for merge_index, (span, attributes) in enumerate(merges):
start = span.start
- end = span.end
spans.append(span)
# House the new merged token where it starts
token = &doc.c[start]
@@ -203,8 +201,9 @@ def _merge(Doc doc, merges):
# for the merged region. To do this, we create a boolean array indicating
# whether the row is to be deleted, then use numpy.delete
if doc.tensor is not None and doc.tensor.size != 0:
- doc.tensor = _resize_tensor(doc.tensor,
- [(m[0].start, m[0].end) for m in merges])
+ doc.tensor = _resize_tensor(
+ doc.tensor, [(m[0].start, m[0].end) for m in merges]
+ )
# Memorize span roots and sets dependencies of the newly merged
# tokens to the dependencies of their roots.
span_roots = []
@@ -267,11 +266,11 @@ def _merge(Doc doc, merges):
span_index += 1
if span_index < len(spans) and i == spans[span_index].start:
# First token in a span
- doc.c[i - offset] = doc.c[i] # move token to its place
+ doc.c[i - offset] = doc.c[i] # move token to its place
offset += (spans[span_index].end - spans[span_index].start) - 1
in_span = True
if not in_span:
- doc.c[i - offset] = doc.c[i] # move token to its place
+ doc.c[i - offset] = doc.c[i] # move token to its place
for i in range(doc.length - offset, doc.length):
memset(&doc.c[i], 0, sizeof(TokenC))
@@ -345,7 +344,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
if to_process_tensor:
xp = get_array_module(doc.tensor)
if xp is numpy:
- doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
+ doc.tensor = xp.append(
+ doc.tensor,
+ xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"),
+ axis=0
+ )
else:
shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
resized_array = xp.zeros(shape, dtype="float32")
@@ -367,7 +370,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
token.norm = 0 # reset norm
if to_process_tensor:
# setting the tensors of the split tokens to array of zeros
- doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
+ doc.tensor[token_index + i:token_index + i + 1] = \
+ xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
# Update the character offset of the subtokens
if i != 0:
token.idx = orig_token.idx + idx_offset
@@ -455,7 +459,6 @@ def normalize_token_attrs(Vocab vocab, attrs):
def set_token_attrs(Token py_token, attrs):
cdef TokenC* token = py_token.c
cdef const LexemeC* lex = token.lex
- cdef Doc doc = py_token.doc
# Assign attributes
for attr_name, attr_value in attrs.items():
if attr_name == "_": # Set extension attributes
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index d7f092c94..d9719609c 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -31,7 +31,7 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
-cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
+cdef int [:, :] _get_lca_matrix(Doc, int start, int end)
cdef class Doc:
@@ -61,7 +61,6 @@ cdef class Doc:
cdef int length
cdef int max_length
-
cdef public object noun_chunks_iterator
cdef object __weakref__
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index 00c7a9d07..55222f8aa 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -8,6 +8,7 @@ from typing import (
List,
Optional,
Protocol,
+ Sequence,
Tuple,
Union,
overload,
@@ -134,7 +135,12 @@ class Doc:
def text(self) -> str: ...
@property
def text_with_ws(self) -> str: ...
- ents: Tuple[Span]
+ # Ideally the getter would output Tuple[Span]
+ # see https://github.com/python/mypy/issues/3004
+ @property
+ def ents(self) -> Sequence[Span]: ...
+ @ents.setter
+ def ents(self, value: Sequence[Span]) -> None: ...
def set_ents(
self,
entities: List[Span],
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 146b276e2..745eb5ff3 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1,4 +1,4 @@
-# cython: infer_types=True, bounds_check=False, profile=True
+# cython: infer_types=True, bounds_check=False
from typing import Set
cimport cython
@@ -43,14 +43,13 @@ from ..attrs cimport (
attr_id_t,
)
from ..lexeme cimport EMPTY_LEXEME, Lexeme
-from ..typedefs cimport attr_t, flags_t
+from ..typedefs cimport attr_t
from .token cimport Token
from .. import parts_of_speech, schemas, util
from ..attrs import IDS, intify_attr
-from ..compat import copy_reg, pickle
+from ..compat import copy_reg
from ..errors import Errors, Warnings
-from ..morphology import Morphology
from ..util import get_words_and_spaces
from ._retokenize import Retokenizer
from .underscore import Underscore, get_ext_args
@@ -784,7 +783,7 @@ cdef class Doc:
# TODO:
# 1. Test basic data-driven ORTH gazetteer
# 2. Test more nuanced date and currency regex
- cdef attr_t entity_type, kb_id, ent_id
+ cdef attr_t kb_id, ent_id
cdef int ent_start, ent_end
ent_spans = []
for ent_info in ents:
@@ -987,7 +986,6 @@ cdef class Doc:
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
"""
cdef int i, j
- cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output
# Handle scalar/list inputs of strings/ints for py_attr_ids
# See also #3064
@@ -999,8 +997,10 @@ cdef class Doc:
py_attr_ids = [py_attr_ids]
# Allow strings, e.g. 'lemma' or 'LEMMA'
try:
- py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
- for id_ in py_attr_ids]
+ py_attr_ids = [
+ (IDS[id_.upper()] if hasattr(id_, "upper") else id_)
+ for id_ in py_attr_ids
+ ]
except KeyError as msg:
keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
@@ -1030,8 +1030,6 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#count_by
"""
cdef int i
- cdef attr_t attr
- cdef size_t count
if counts is None:
counts = Counter()
@@ -1093,7 +1091,6 @@ cdef class Doc:
cdef int i, col
cdef int32_t abs_head_index
cdef attr_id_t attr_id
- cdef TokenC* tokens = self.c
cdef int length = len(array)
if length != len(self):
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
@@ -1225,7 +1222,7 @@ cdef class Doc:
span.label,
span.kb_id,
span.id,
- span.text, # included as a check
+ span.text, # included as a check
))
char_offset += len(doc.text)
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
@@ -1508,7 +1505,6 @@ cdef class Doc:
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The first newly merged token.
"""
- cdef str tag, lemma, ent_type
attr_len = len(attributes)
span_len = len(spans)
if not attr_len == span_len:
@@ -1624,7 +1620,6 @@ cdef class Doc:
for token in char_span[1:]:
token.is_sent_start = False
-
for span_group in doc_json.get("spans", {}):
spans = []
for span in doc_json["spans"][span_group]:
@@ -1656,7 +1651,7 @@ cdef class Doc:
start = token_by_char(self.c, self.length, token_data["start"])
value = token_data["value"]
self[start]._.set(token_attr, value)
-
+
for span_attr in doc_json.get("underscore_span", {}):
if not Span.has_extension(span_attr):
Span.set_extension(span_attr)
@@ -1698,7 +1693,7 @@ cdef class Doc:
token_data["dep"] = token.dep_
token_data["head"] = token.head.i
data["tokens"].append(token_data)
-
+
if self.spans:
data["spans"] = {}
for span_group in self.spans:
@@ -1769,7 +1764,6 @@ cdef class Doc:
output.fill(255)
cdef int i, j, start_idx, end_idx
cdef bytes byte_string
- cdef unsigned char utf8_char
for i, byte_string in enumerate(byte_strings):
j = 0
start_idx = 0
@@ -1822,8 +1816,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
# note: end is exclusive
- cdef TokenC* head
- cdef TokenC* child
cdef int i
# Set number of left/right children to 0. We'll increment it in the loops.
for i in range(start, end):
@@ -1923,7 +1915,7 @@ cdef int _get_tokens_lca(Token token_j, Token token_k):
return -1
-cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
+cdef int [:, :] _get_lca_matrix(Doc doc, int start, int end):
"""Given a doc and a start and end position defining a set of contiguous
tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
LCA[i, j] is the index of the lowest common ancestor among token i and j.
@@ -1936,7 +1928,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
with shape (n, n), where n = len(doc).
"""
- cdef int [:,:] lca_matrix
+ cdef int [:, :] lca_matrix
cdef int j, k
n_tokens= end - start
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx
index 47f0a20d4..6c4ce6ce3 100644
--- a/spacy/tokens/graph.pyx
+++ b/spacy/tokens/graph.pyx
@@ -1,9 +1,10 @@
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
+# cython: profile=False
from typing import Generator, List, Tuple
cimport cython
from cython.operator cimport dereference
-from libc.stdint cimport int32_t, int64_t
+from libc.stdint cimport int32_t
from libcpp.pair cimport pair
from libcpp.unordered_map cimport unordered_map
from libcpp.unordered_set cimport unordered_set
@@ -11,7 +12,6 @@ from libcpp.unordered_set cimport unordered_set
import weakref
from murmurhash.mrmr cimport hash64
-from preshed.maps cimport map_get_unless_missing
from .. import Errors
@@ -28,7 +28,7 @@ from .token import Token
cdef class Edge:
cdef readonly Graph graph
cdef readonly int i
-
+
def __init__(self, Graph graph, int i):
self.graph = graph
self.i = i
@@ -44,7 +44,7 @@ cdef class Edge:
@property
def head(self) -> "Node":
return Node(self.graph, self.graph.c.edges[self.i].head)
-
+
@property
def tail(self) -> "Tail":
return Node(self.graph, self.graph.c.edges[self.i].tail)
@@ -70,7 +70,7 @@ cdef class Node:
def __init__(self, Graph graph, int i):
"""A reference to a node of an annotation graph. Each node is made up of
an ordered set of zero or more token indices.
-
+
Node references are usually created by the Graph object itself, or from
the Node or Edge objects. You usually won't need to instantiate this
class yourself.
@@ -109,13 +109,13 @@ cdef class Node:
@property
def is_none(self) -> bool:
"""Whether the node is a special value, indicating 'none'.
-
+
The NoneNode type is returned by the Graph, Edge and Node objects when
there is no match to a query. It has the same API as Node, but it always
returns NoneNode, NoneEdge or empty lists for its queries.
"""
return False
-
+
@property
def doc(self) -> "Doc":
"""The Doc object that the graph refers to."""
@@ -130,19 +130,19 @@ cdef class Node:
def head(self, i=None, label=None) -> "Node":
"""Get the head of the first matching edge, searching by index, label,
both or neither.
-
+
For instance, `node.head(i=1)` will get the head of the second edge that
this node is a tail of. `node.head(i=1, label="ARG0")` will further
check that the second edge has the label `"ARG0"`.
-
+
If no matching node can be found, the graph's NoneNode is returned.
"""
return self.headed(i=i, label=label)
-
+
def tail(self, i=None, label=None) -> "Node":
"""Get the tail of the first matching edge, searching by index, label,
both or neither.
-
+
If no matching node can be found, the graph's NoneNode is returned.
"""
return self.tailed(i=i, label=label).tail
@@ -171,7 +171,7 @@ cdef class Node:
cdef vector[int] edge_indices
self._find_edges(edge_indices, "head", label)
return [Node(self.graph, self.graph.c.edges[i].head) for i in edge_indices]
-
+
def tails(self, label=None) -> List["Node"]:
"""Find all matching tails of this node."""
cdef vector[int] edge_indices
@@ -200,7 +200,7 @@ cdef class Node:
return NoneEdge(self.graph)
else:
return Edge(self.graph, idx)
-
+
def tailed(self, i=None, label=None) -> Edge:
"""Find the first matching edge tailed by this node.
If no matching edge can be found, the graph's NoneEdge is returned.
@@ -283,7 +283,7 @@ cdef class NoneEdge(Edge):
def __init__(self, graph):
self.graph = graph
self.i = -1
-
+
@property
def doc(self) -> "Doc":
return self.graph.doc
@@ -291,7 +291,7 @@ cdef class NoneEdge(Edge):
@property
def head(self) -> "NoneNode":
return NoneNode(self.graph)
-
+
@property
def tail(self) -> "NoneNode":
return NoneNode(self.graph)
@@ -319,7 +319,7 @@ cdef class NoneNode(Node):
def __len__(self):
return 0
-
+
@property
def is_none(self):
return -1
@@ -340,14 +340,14 @@ cdef class NoneNode(Node):
def walk_heads(self):
yield from []
-
+
def walk_tails(self):
yield from []
-
+
cdef class Graph:
"""A set of directed labelled relationships between sets of tokens.
-
+
EXAMPLE:
Construction 1
>>> graph = Graph(doc, name="srl")
@@ -372,7 +372,9 @@ cdef class Graph:
>>> assert graph.has_node((0,))
>>> assert graph.has_edge((0,), (1,3), label="agent")
"""
- def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
+ def __init__(
+ self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None # no-cython-lint
+ ):
"""Create a Graph object.
doc (Doc): The Doc object the graph will refer to.
@@ -438,13 +440,11 @@ cdef class Graph:
def add_edge(self, head, tail, *, label="", weight=None) -> Edge:
"""Add an edge to the graph, connecting two groups of tokens.
-
+
If there is already an edge for the (head, tail, label) triple, it will
be returned, and no new edge will be created. The weight of the edge
will be updated if a weight is specified.
"""
- label_hash = self.doc.vocab.strings.as_int(label)
- weight_float = weight if weight is not None else 0.0
edge_index = add_edge(
&self.c,
EdgeC(
@@ -478,11 +478,11 @@ cdef class Graph:
def has_edge(self, head, tail, label) -> bool:
"""Check whether a (head, tail, label) triple is an edge in the graph."""
return not self.get_edge(head, tail, label=label).is_none
-
+
def add_node(self, indices) -> Node:
"""Add a node to the graph and return it. Nodes refer to ordered sets
of token indices.
-
+
This method is idempotent: if there is already a node for the given
indices, it is returned without a new node being created.
"""
@@ -510,7 +510,7 @@ cdef class Graph:
return NoneNode(self)
else:
return Node(self, node_index)
-
+
def has_node(self, tuple indices) -> bool:
"""Check whether the graph has a node for the given indices."""
return not self.get_node(indices).is_none
@@ -570,7 +570,7 @@ cdef int add_node(GraphC* graph, vector[int32_t]& node) nogil:
graph.roots.insert(index)
graph.node_map.insert(pair[hash_t, int](key, index))
return index
-
+
cdef int get_node(const GraphC* graph, vector[int32_t] node) nogil:
key = hash64(&node[0], node.size() * sizeof(node[0]), 0)
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index 0992a0b66..ea5d07fa4 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -1,3 +1,4 @@
+# cython: profile=False
cimport numpy as np
from libc.string cimport memset
@@ -89,4 +90,3 @@ cdef class MorphAnalysis:
def __repr__(self):
return self.to_json()
-
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 59ee21687..af3ba8db5 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -1,5 +1,5 @@
+# cython: profile=False
cimport numpy as np
-from libc.math cimport sqrt
import copy
import warnings
@@ -10,11 +10,10 @@ from thinc.api import get_array_module
from ..attrs cimport *
from ..attrs cimport ORTH, attr_id_t
from ..lexeme cimport Lexeme
-from ..parts_of_speech cimport univ_pos_t
-from ..structs cimport LexemeC, TokenC
+from ..structs cimport TokenC
from ..symbols cimport dep
-from ..typedefs cimport attr_t, flags_t, hash_t
-from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
+from ..typedefs cimport attr_t, hash_t
+from .doc cimport _get_lca_matrix, get_token_attr
from .token cimport Token
from ..errors import Errors, Warnings
@@ -595,7 +594,6 @@ cdef class Span:
"""
return "".join([t.text_with_ws for t in self])
-
@property
def noun_chunks(self):
"""Iterate over the base noun phrases in the span. Yields base
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 48ad4a516..257c907bc 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -1,7 +1,8 @@
+# cython: profile=False
import struct
import weakref
from copy import deepcopy
-from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
+from typing import Iterable, Optional, Union
import srsly
@@ -34,7 +35,7 @@ cdef class SpanGroup:
DOCS: https://spacy.io/api/spangroup
"""
- def __init__(self, doc, *, name="", attrs={}, spans=[]):
+ def __init__(self, doc, *, name="", attrs={}, spans=[]): # no-cython-lint
"""Create a SpanGroup.
doc (Doc): The reference Doc object.
@@ -311,7 +312,7 @@ cdef class SpanGroup:
other_attrs = deepcopy(other_group.attrs)
span_group.attrs.update({
- key: value for key, value in other_attrs.items() \
+ key: value for key, value in other_attrs.items()
if key not in span_group.attrs
})
if len(other_group):
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index fc02ff624..f4e4611df 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -26,7 +26,7 @@ cdef class Token:
cdef Token self = Token.__new__(Token, vocab, doc, offset)
return self
- #cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
+ # cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
# cdef TokenC token
# attrs = normalize_attrs(attrs)
@@ -98,12 +98,10 @@ cdef class Token:
elif feat_name == SENT_START:
token.sent_start = value
-
@staticmethod
cdef inline int missing_dep(const TokenC* token) nogil:
return token.dep == MISSING_DEP
-
@staticmethod
cdef inline int missing_head(const TokenC* token) nogil:
return Token.missing_dep(token)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 6018c3112..9fd4118d6 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -1,13 +1,12 @@
# cython: infer_types=True
+# cython: profile=False
# Compiler crashes on memory view coercion without this. Should report bug.
cimport numpy as np
-from cython.view cimport array as cvarray
np.import_array()
import warnings
-import numpy
from thinc.api import get_array_module
from ..attrs cimport (
@@ -238,7 +237,7 @@ cdef class Token:
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
-
+
def has_morph(self):
"""Check whether the token has annotated morph information.
Return False when the morph annotation is unset/missing.
@@ -545,9 +544,9 @@ cdef class Token:
def __get__(self):
if self.i + 1 == len(self.doc):
return True
- elif self.doc[self.i+1].is_sent_start == None:
+ elif self.doc[self.i+1].is_sent_start is None:
return None
- elif self.doc[self.i+1].is_sent_start == True:
+ elif self.doc[self.i+1].is_sent_start is True:
return True
else:
return False
diff --git a/spacy/training/align.pyx b/spacy/training/align.pyx
index 8bd43b048..c68110e30 100644
--- a/spacy/training/align.pyx
+++ b/spacy/training/align.pyx
@@ -1,3 +1,4 @@
+# cython: profile=False
import re
from itertools import chain
from typing import List, Tuple
@@ -37,10 +38,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
b2a.append(set())
# Process the alignment at the current position
if A[token_idx_a] == B[token_idx_b] and \
- (char_idx_a == 0 or \
- char_to_token_a[char_idx_a - 1] < token_idx_a) and \
- (char_idx_b == 0 or \
- char_to_token_b[char_idx_b - 1] < token_idx_b):
+ (
+ char_idx_a == 0 or
+ char_to_token_a[char_idx_a - 1] < token_idx_a
+ ) and \
+ (
+ char_idx_b == 0 or
+ char_to_token_b[char_idx_b - 1] < token_idx_b
+ ):
# Current tokens are identical and both character offsets are the
# start of a token (either at the beginning of the document or the
# previous character belongs to a different token)
diff --git a/spacy/training/alignment_array.pyx b/spacy/training/alignment_array.pyx
index b0be1512b..f0eb5cf39 100644
--- a/spacy/training/alignment_array.pyx
+++ b/spacy/training/alignment_array.pyx
@@ -1,3 +1,4 @@
+# cython: profile=False
from typing import List
import numpy
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 6037c15e3..5cc2733a5 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -63,7 +63,7 @@ def create_plain_text_reader(
path: Optional[Path],
min_length: int = 0,
max_length: int = 0,
-) -> Callable[["Language"], Iterable[Doc]]:
+) -> Callable[["Language"], Iterable[Example]]:
"""Iterate Example objects from a file or directory of plain text
UTF-8 files with one line per doc.
diff --git a/spacy/training/example.pyi b/spacy/training/example.pyi
new file mode 100644
index 000000000..06639d70c
--- /dev/null
+++ b/spacy/training/example.pyi
@@ -0,0 +1,66 @@
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+
+from ..tokens import Doc, Span
+from ..vocab import Vocab
+from .alignment import Alignment
+
+def annotations_to_doc(
+ vocab: Vocab,
+ tok_annot: Dict[str, Any],
+ doc_annot: Dict[str, Any],
+) -> Doc: ...
+def validate_examples(
+ examples: Iterable[Example],
+ method: str,
+) -> None: ...
+def validate_get_examples(
+ get_examples: Callable[[], Iterable[Example]],
+ method: str,
+): ...
+
+class Example:
+ x: Doc
+ y: Doc
+
+ def __init__(
+ self,
+ predicted: Doc,
+ reference: Doc,
+ *,
+ alignment: Optional[Alignment] = None,
+ ): ...
+ def __len__(self) -> int: ...
+ @property
+ def predicted(self) -> Doc: ...
+ @predicted.setter
+ def predicted(self, doc: Doc) -> None: ...
+ @property
+ def reference(self) -> Doc: ...
+ @reference.setter
+ def reference(self, doc: Doc) -> None: ...
+ def copy(self) -> Example: ...
+ @classmethod
+ def from_dict(cls, predicted: Doc, example_dict: Dict[str, Any]) -> Example: ...
+ @property
+ def alignment(self) -> Alignment: ...
+ def get_aligned(self, field: str, as_string=False): ...
+ def get_aligned_parse(self, projectivize=True): ...
+ def get_aligned_sent_starts(self): ...
+ def get_aligned_spans_x2y(
+ self, x_spans: Iterable[Span], allow_overlap=False
+ ) -> List[Span]: ...
+ def get_aligned_spans_y2x(
+ self, y_spans: Iterable[Span], allow_overlap=False
+ ) -> List[Span]: ...
+ def get_aligned_ents_and_ner(self) -> Tuple[List[Span], List[str]]: ...
+ def get_aligned_ner(self) -> List[str]: ...
+ def get_matching_ents(self, check_label: bool = True) -> List[Span]: ...
+ def to_dict(self) -> Dict[str, Any]: ...
+ def split_sents(self) -> List[Example]: ...
+ @property
+ def text(self) -> str: ...
+ def __str__(self) -> str: ...
+ def __repr__(self) -> str: ...
+
+def _parse_example_dict_data(example_dict): ...
+def _fix_legacy_dict_data(example_dict): ...
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index abdac23ea..abdcecf71 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -1,4 +1,4 @@
-import warnings
+# cython: profile=False
from collections.abc import Iterable as IterableInstance
import numpy
@@ -31,9 +31,9 @@ cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
attrs, array = _annot2array(vocab, tok_annot, doc_annot)
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
if "entities" in doc_annot:
- _add_entities_to_doc(output, doc_annot["entities"])
+ _add_entities_to_doc(output, doc_annot["entities"])
if "spans" in doc_annot:
- _add_spans_to_doc(output, doc_annot["spans"])
+ _add_spans_to_doc(output, doc_annot["spans"])
if array.size:
output = output.from_array(attrs, array)
# links are currently added with ENT_KB_ID on the token level
@@ -161,7 +161,6 @@ cdef class Example:
self._y_sig = y_sig
return self._cached_alignment
-
def _get_aligned_vectorized(self, align, gold_values):
# Fast path for Doc attributes/fields that are predominantly a single value,
# i.e., TAG, POS, MORPH.
@@ -204,7 +203,6 @@ cdef class Example:
return output.tolist()
-
def _get_aligned_non_vectorized(self, align, gold_values):
# Slower path for fields that return multiple values (resulting
# in ragged arrays that cannot be vectorized trivially).
@@ -221,7 +219,6 @@ cdef class Example:
return output
-
def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute."""
align = self.alignment.x2y
@@ -330,7 +327,7 @@ cdef class Example:
missing=None
)
# Now fill the tokens we can align to O.
- O = 2 # I=1, O=2, B=3
+ O = 2 # I=1, O=2, B=3 # no-cython-lint: E741
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
if x_tags[i] is None:
if ent_iob == O:
@@ -340,7 +337,7 @@ cdef class Example:
return x_ents, x_tags
def get_aligned_ner(self):
- x_ents, x_tags = self.get_aligned_ents_and_ner()
+ _x_ents, x_tags = self.get_aligned_ents_and_ner()
return x_tags
def get_matching_ents(self, check_label=True):
@@ -398,7 +395,6 @@ cdef class Example:
return span_dict
-
def _links_to_dict(self):
links = {}
for ent in self.reference.ents:
@@ -589,6 +585,7 @@ def _fix_legacy_dict_data(example_dict):
"doc_annotation": doc_dict
}
+
def _has_field(annot, field):
if field not in annot:
return False
@@ -625,6 +622,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
ent_types.append("")
return ent_iobs, ent_types
+
def _parse_links(vocab, words, spaces, links):
reference = Doc(vocab, words=words, spaces=spaces)
starts = {token.idx: token.i for token in reference}
diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index 1e7b3681d..afbdf4631 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -1,4 +1,4 @@
-import json
+# cython: profile=False
import warnings
import srsly
@@ -6,7 +6,7 @@ import srsly
from .. import util
from ..errors import Warnings
from ..tokens import Doc
-from .iob_utils import offsets_to_biluo_tags, tags_to_entities
+from .iob_utils import offsets_to_biluo_tags
def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
@@ -23,7 +23,13 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
json_doc = {"id": doc_id, "paragraphs": []}
for i, doc in enumerate(docs):
raw = None if doc.has_unknown_spaces else doc.text
- json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []}
+ json_para = {
+            "raw": raw,
+ "sentences": [],
+ "cats": [],
+ "entities": [],
+ "links": []
+ }
for cat, val in doc.cats.items():
json_cat = {"label": cat, "value": val}
json_para["cats"].append(json_cat)
@@ -35,13 +41,17 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
if ent.kb_id_:
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
json_para["links"].append(link_dict)
- biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
+ biluo_tags = offsets_to_biluo_tags(
+ doc, json_para["entities"], missing=ner_missing_tag
+ )
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
for j, sent in enumerate(doc.sents):
json_sent = {"tokens": [], "brackets": []}
for token in sent:
- json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
+ json_token = {
+ "id": token.i, "orth": token.text, "space": token.whitespace_
+ }
if include_annotation["TAG"]:
json_token["tag"] = token.tag_
if include_annotation["POS"]:
@@ -125,9 +135,14 @@ def json_to_annotations(doc):
else:
sent_starts.append(-1)
if "brackets" in sent:
- brackets.extend((b["first"] + sent_start_i,
- b["last"] + sent_start_i, b["label"])
- for b in sent["brackets"])
+ brackets.extend(
+ (
+ b["first"] + sent_start_i,
+ b["last"] + sent_start_i,
+ b["label"]
+ )
+ for b in sent["brackets"]
+ )
example["token_annotation"] = dict(
ids=ids,
@@ -160,6 +175,7 @@ def json_to_annotations(doc):
)
yield example
+
def json_iterate(bytes utf8_str):
# We should've made these files jsonl...But since we didn't, parse out
# the docs one-by-one to reduce memory usage.
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 82d4ebf24..062170221 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -302,7 +302,7 @@ def read_vectors(
shape = (truncate_vectors, shape[1])
vectors_data = numpy.zeros(shape=shape, dtype="f")
vectors_keys = []
- for i, line in enumerate(tqdm.tqdm(f)):
+ for i, line in enumerate(tqdm.tqdm(f, disable=None)):
line = line.rstrip()
pieces = line.rsplit(" ", vectors_data.shape[1])
word = pieces.pop(0)
diff --git a/spacy/typedefs.pyx b/spacy/typedefs.pyx
index e69de29bb..61bf62038 100644
--- a/spacy/typedefs.pyx
+++ b/spacy/typedefs.pyx
@@ -0,0 +1 @@
+# cython: profile=False
diff --git a/spacy/util.py b/spacy/util.py
index 762699a97..8464e411f 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -101,7 +101,6 @@ logger.addHandler(logger_stream_handler)
class ENV_VARS:
CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
- PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"
class registry(thinc.registry):
@@ -119,6 +118,7 @@ class registry(thinc.registry):
augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
loggers = catalogue.create("spacy", "loggers", entry_points=True)
scorers = catalogue.create("spacy", "scorers", entry_points=True)
+ vectors = catalogue.create("spacy", "vectors", entry_points=True)
# These are factories registered via third-party packages and the
# spacy_factories entry point. This registry only exists so we can easily
# load them via the entry points. The "true" factories are added via the
@@ -894,7 +894,7 @@ def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
if "spacy_version" in meta:
if not is_compatible_version(about.__version__, meta["spacy_version"]):
lower_version = get_model_lower_version(meta["spacy_version"])
- lower_version = get_minor_version(lower_version) # type: ignore[arg-type]
+ lower_version = get_base_version(lower_version) # type: ignore[arg-type]
if lower_version is not None:
lower_version = "v" + lower_version
elif "spacy_git_version" in meta:
@@ -974,23 +974,12 @@ def replace_model_node(model: Model, target: Model, replacement: Model) -> None:
def split_command(command: str) -> List[str]:
"""Split a string command using shlex. Handles platform compatibility.
-
command (str) : The command to split
RETURNS (List[str]): The split command.
"""
return shlex.split(command, posix=not is_windows)
-def join_command(command: List[str]) -> str:
- """Join a command using shlex. shlex.join is only available for Python 3.8+,
- so we're using a workaround here.
-
- command (List[str]): The command to join.
- RETURNS (str): The joined command
- """
- return " ".join(shlex.quote(cmd) for cmd in command)
-
-
def run_command(
command: Union[str, List[str]],
*,
@@ -999,7 +988,6 @@ def run_command(
) -> subprocess.CompletedProcess:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
-
command (str / List[str]): The command. If provided as a string, the
string will be split using shlex.split.
stdin (Optional[Any]): stdin to read from or None.
@@ -1050,7 +1038,6 @@ def run_command(
@contextmanager
def working_dir(path: Union[str, Path]) -> Iterator[Path]:
"""Change current working directory and returns to previous on exit.
-
path (str / Path): The directory to navigate to.
YIELDS (Path): The absolute path to the current working directory. This
should be used if the block needs to perform actions within the working
@@ -1069,7 +1056,6 @@ def working_dir(path: Union[str, Path]) -> Iterator[Path]:
def make_tempdir() -> Generator[Path, None, None]:
"""Execute a block in a temporary directory and remove the directory and
its contents at the end of the with block.
-
YIELDS (Path): The path of the temp directory.
"""
d = Path(tempfile.mkdtemp())
@@ -1082,20 +1068,14 @@ def make_tempdir() -> Generator[Path, None, None]:
rmfunc(path)
try:
- shutil.rmtree(str(d), onerror=force_remove)
+ if sys.version_info >= (3, 12):
+ shutil.rmtree(str(d), onexc=force_remove)
+ else:
+ shutil.rmtree(str(d), onerror=force_remove)
except PermissionError as e:
warnings.warn(Warnings.W091.format(dir=d, msg=e))
-def is_cwd(path: Union[Path, str]) -> bool:
- """Check whether a path is the current working directory.
-
- path (Union[Path, str]): The directory path.
- RETURNS (bool): Whether the path is the current working directory.
- """
- return str(Path(path).resolve()).lower() == str(Path.cwd().resolve()).lower()
-
-
def is_in_jupyter() -> bool:
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index bf79481b8..6ff99bb59 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,13 +1,15 @@
-cimport numpy as np
+# cython: infer_types=True, binding=True
+from typing import Callable
+
from cython.operator cimport dereference as deref
from libc.stdint cimport uint32_t, uint64_t
from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64
-import functools
import warnings
from enum import Enum
-from typing import cast
+from pathlib import Path
+from typing import TYPE_CHECKING, Union, cast
import numpy
import srsly
@@ -23,6 +25,9 @@ from .attrs import IDS
from .errors import Errors, Warnings
from .strings import get_string_id
+if TYPE_CHECKING:
+ from .vocab import Vocab # noqa: F401 # no-cython-lint
+
def unpickle_vectors(bytes_data):
return Vectors().from_bytes(bytes_data)
@@ -37,7 +42,71 @@ class Mode(str, Enum):
return list(cls.__members__.keys())
-cdef class Vectors:
+cdef class BaseVectors:
+ def __init__(self, *, strings=None):
+ # Make sure abstract BaseVectors is not instantiated.
+ if self.__class__ == BaseVectors:
+ raise TypeError(
+ Errors.E1046.format(cls_name=self.__class__.__name__)
+ )
+
+ def __getitem__(self, key):
+ raise NotImplementedError
+
+ def __contains__(self, key):
+ raise NotImplementedError
+
+ def is_full(self):
+ raise NotImplementedError
+
+ def get_batch(self, keys):
+ raise NotImplementedError
+
+ @property
+ def shape(self):
+ raise NotImplementedError
+
+ def __len__(self):
+ raise NotImplementedError
+
+ @property
+ def vectors_length(self):
+ raise NotImplementedError
+
+ @property
+ def size(self):
+ raise NotImplementedError
+
+ def add(self, key, *, vector=None):
+ raise NotImplementedError
+
+ def to_ops(self, ops: Ops):
+ pass
+
+ # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
+ # allow serialization
+ def to_bytes(self, **kwargs):
+ return b""
+
+ def from_bytes(self, data: bytes, **kwargs):
+ return self
+
+ def to_disk(self, path: Union[str, Path], **kwargs):
+ return None
+
+ def from_disk(self, path: Union[str, Path], **kwargs):
+ return self
+
+
+@util.registry.vectors("spacy.Vectors.v1")
+def create_mode_vectors() -> Callable[["Vocab"], BaseVectors]:
+ def vectors_factory(vocab: "Vocab") -> BaseVectors:
+ return Vectors(strings=vocab.strings)
+
+ return vectors_factory
+
+
+cdef class Vectors(BaseVectors):
"""Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an
@@ -119,7 +188,7 @@ cdef class Vectors:
if self.mode == Mode.default:
if data is None:
if shape is None:
- shape = (0,0)
+ shape = (0, 0)
ops = get_current_ops()
data = ops.xp.zeros(shape, dtype="f")
self._unset = cppset[int]({i for i in range(data.shape[0])})
@@ -260,11 +329,10 @@ cdef class Vectors:
def __eq__(self, other):
# Check for equality, with faster checks first
return (
- self.shape == other.shape
- and self.key2row == other.key2row
- and self.to_bytes(exclude=["strings"])
- == other.to_bytes(exclude=["strings"])
- )
+ self.shape == other.shape
+ and self.key2row == other.key2row
+ and self.to_bytes(exclude=["strings"]) == other.to_bytes(exclude=["strings"])
+ )
def resize(self, shape, inplace=False):
"""Resize the underlying vectors array. If inplace=True, the memory
@@ -520,11 +588,12 @@ cdef class Vectors:
# vectors e.g. (10000, 300)
# sims e.g. (1024, 10000)
sims = xp.dot(batch, vectors.T)
- best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:,-n:]
- scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
+ best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:, -n:]
+ scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:, -n:]
if sort and n >= 2:
- sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
+ sorted_index = xp.arange(scores.shape[0])[:, None][i:i+batch_size], \
+ xp.argsort(scores[i:i+batch_size], axis=1)[:, ::-1]
scores[i:i+batch_size] = scores[sorted_index]
best_rows[i:i+batch_size] = best_rows[sorted_index]
@@ -538,8 +607,12 @@ cdef class Vectors:
numpy_rows = get_current_ops().to_numpy(best_rows)
keys = xp.asarray(
- [[row2key[row] for row in numpy_rows[i] if row in row2key]
- for i in range(len(queries)) ], dtype="uint64")
+ [
+ [row2key[row] for row in numpy_rows[i] if row in row2key]
+ for i in range(len(queries))
+ ],
+ dtype="uint64"
+ )
return (keys, best_rows, scores)
def to_ops(self, ops: Ops):
@@ -582,9 +655,9 @@ cdef class Vectors:
"""
xp = get_array_module(self.data)
if xp is numpy:
- save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
+ save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) # no-cython-lint
else:
- save_array = lambda arr, file_: xp.save(file_, arr)
+ save_array = lambda arr, file_: xp.save(file_, arr) # no-cython-lint
def save_vectors(path):
# the source of numpy.save indicates that the file object is closed after use.
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 3b0173e3e..43e47af1d 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -32,7 +32,7 @@ cdef class Vocab:
cdef public object writing_system
cdef public object get_noun_chunks
cdef readonly int length
- cdef public object _unused_object # TODO remove in v4, see #9150
+ cdef public object _unused_object # TODO remove in v4, see #9150
cdef public object lex_attr_getters
cdef public object cfg
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 520228b51..4004a70e0 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,6 +1,3 @@
-# cython: profile=True
-from libc.string cimport memcpy
-
import functools
import numpy
@@ -19,7 +16,6 @@ from .errors import Errors
from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
from .lang.norm_exceptions import BASE_NORMS
from .lookups import Lookups
-from .util import registry
from .vectors import Mode as VectorsMode
from .vectors import Vectors
@@ -51,9 +47,17 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab
"""
- def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
- oov_prob=-20., vectors_name=None, writing_system={},
- get_noun_chunks=None, **deprecated_kwargs):
+ def __init__(
+ self,
+ lex_attr_getters=None,
+ strings=tuple(),
+ lookups=None,
+ oov_prob=-20.,
+ vectors_name=None,
+ writing_system={}, # no-cython-lint
+ get_noun_chunks=None,
+ **deprecated_kwargs
+ ):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -89,8 +93,9 @@ cdef class Vocab:
return self._vectors
def __set__(self, vectors):
- for s in vectors.strings:
- self.strings.add(s)
+ if hasattr(vectors, "strings"):
+ for s in vectors.strings:
+ self.strings.add(s)
self._vectors = vectors
self._vectors.strings = self.strings
@@ -150,7 +155,6 @@ cdef class Vocab:
cdef LexemeC* lex
cdef hash_t key = self.strings[string]
lex = self._by_orth.get(key)
- cdef size_t addr
if lex != NULL:
assert lex.orth in self.strings
if lex.orth != key:
@@ -183,13 +187,13 @@ cdef class Vocab:
# of the doc ownership).
# TODO: Change the C API so that the mem isn't passed in here.
mem = self.mem
- #if len(string) < 3 or self.length < 10000:
+ # if len(string) < 3 or self.length < 10000:
# mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.length = len(string)
- if self.vectors is not None:
+ if self.vectors is not None and hasattr(self.vectors, "key2row"):
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
else:
lex.id = OOV_RANK
@@ -285,12 +289,17 @@ cdef class Vocab:
@property
def vectors_length(self):
- return self.vectors.shape[1]
+ if hasattr(self.vectors, "shape"):
+ return self.vectors.shape[1]
+ else:
+ return -1
def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same
width, you have to call this to change the size of the vectors.
"""
+ if not isinstance(self.vectors, Vectors):
+ raise ValueError(Errors.E849.format(method="reset_vectors", vectors_type=type(self.vectors)))
if width is not None and shape is not None:
raise ValueError(Errors.E065.format(width=width, shape=shape))
elif shape is not None:
@@ -300,6 +309,8 @@ cdef class Vocab:
self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
def deduplicate_vectors(self):
+ if not isinstance(self.vectors, Vectors):
+ raise ValueError(Errors.E849.format(method="deduplicate_vectors", vectors_type=type(self.vectors)))
if self.vectors.mode != VectorsMode.default:
raise ValueError(Errors.E858.format(
mode=self.vectors.mode,
@@ -353,6 +364,8 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#prune_vectors
"""
+ if not isinstance(self.vectors, Vectors):
+ raise ValueError(Errors.E849.format(method="prune_vectors", vectors_type=type(self.vectors)))
if self.vectors.mode != VectorsMode.default:
raise ValueError(Errors.E858.format(
mode=self.vectors.mode,
@@ -463,7 +476,6 @@ cdef class Vocab:
self.lookups.get_table("lexeme_norm"),
)
-
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
@@ -476,7 +488,6 @@ cdef class Vocab:
path = util.ensure_path(path)
if not path.exists():
path.mkdir()
- setters = ["strings", "vectors"]
if "strings" not in exclude:
self.strings.to_disk(path / "strings.json")
if "vectors" not in exclude:
@@ -495,7 +506,6 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#to_disk
"""
path = util.ensure_path(path)
- getters = ["strings", "vectors"]
if "strings" not in exclude:
self.strings.from_disk(path / "strings.json") # TODO: add exclude?
if "vectors" not in exclude:
diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index a292194e9..0ec915bd3 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -481,6 +481,286 @@ The other arguments are shared between all versions.
+## Curated Transformer architectures {id="curated-trf",source="https://github.com/explosion/spacy-curated-transformers/blob/main/spacy_curated_transformers/models/architectures.py"}
+
+The following architectures are provided by the package
+[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers).
+See the [usage documentation](/usage/embeddings-transformers#transformers) for
+how to integrate the architectures into your training config.
+
+When loading the model
+[from the Hugging Face Hub](/api/curatedtransformer#hf_trfencoder_loader), the
+model config's parameters must be the same as the hyperparameters used by the
+pre-trained model. The
+[`init fill-curated-transformer`](/api/cli#init-fill-curated-transformer) CLI
+command can be used to automatically fill in these values.
+
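+As a rough sketch of how one of these architectures might be referenced in a
+training config (the `curated_transformer` factory name and the concrete
+hyperparameter values below are illustrative assumptions, not taken from a
+specific pipeline), the transformer component could be configured roughly as
+follows, with the remaining settings filled in by
+[`init fill-curated-transformer`](/api/cli#init-fill-curated-transformer):
+
+```ini
+[components.transformer]
+factory = "curated_transformer"
+
+[components.transformer.model]
+@architectures = "spacy-curated-transformers.BertTransformer.v1"
+# Illustrative values only; fill in the real hyperparameters from the Hub
+# checkpoint with `init fill-curated-transformer`.
+vocab_size = 28996
+hidden_width = 768
+num_hidden_layers = 12
+num_attention_heads = 12
+```
+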
+### spacy-curated-transformers.AlbertTransformer.v1
+
+Construct an ALBERT transformer model.
+
+| Name | Description |
+| ------------------------------ | ---------------------------------------------------------------------------------------- |
+| `vocab_size` | Vocabulary size. ~~int~~ |
+| `with_spans` | Callback that constructs a span generator model. ~~Callable~~ |
+| `piece_encoder` | The piece encoder to segment input tokens. ~~Model~~ |
+| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~ |
+| `embedding_width` | Width of the embedding representations. ~~int~~ |
+| `hidden_act` | Activation used by the point-wise feed-forward layers. ~~str~~ |
+| `hidden_dropout_prob` | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~ |
+| `hidden_width` | Width of the final representations. ~~int~~ |
+| `intermediate_width` | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~ |
+| `layer_norm_eps` | Epsilon for layer normalization. ~~float~~ |
+| `max_position_embeddings` | Maximum length of position embeddings. ~~int~~ |
+| `model_max_length` | Maximum length of model inputs. ~~int~~ |
+| `num_attention_heads` | Number of self-attention heads. ~~int~~ |
+| `num_hidden_groups` | Number of layer groups whose constituents share parameters. ~~int~~ |
+| `num_hidden_layers` | Number of hidden layers. ~~int~~ |
+| `padding_idx` | Index of the padding meta-token. ~~int~~ |
+| `type_vocab_size` | Type vocabulary size. ~~int~~ |
+| `mixed_precision` | Use mixed-precision training. ~~bool~~ |
+| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ |
+| **CREATES** | The model using the architecture ~~Model~~ |
+
+### spacy-curated-transformers.BertTransformer.v1
+
+Construct a BERT transformer model.
+
+| Name | Description |
+| ------------------------------ | ---------------------------------------------------------------------------------------- |
+| `vocab_size` | Vocabulary size. ~~int~~ |
+| `with_spans` | Callback that constructs a span generator model. ~~Callable~~ |
+| `piece_encoder` | The piece encoder to segment input tokens. ~~Model~~ |
+| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~ |
+| `hidden_act` | Activation used by the point-wise feed-forward layers. ~~str~~ |
+| `hidden_dropout_prob` | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~ |
+| `hidden_width` | Width of the final representations. ~~int~~ |
+| `intermediate_width` | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~ |
+| `layer_norm_eps` | Epsilon for layer normalization. ~~float~~ |
+| `max_position_embeddings` | Maximum length of position embeddings. ~~int~~ |
+| `model_max_length` | Maximum length of model inputs. ~~int~~ |
+| `num_attention_heads` | Number of self-attention heads. ~~int~~ |
+| `num_hidden_layers` | Number of hidden layers. ~~int~~ |
+| `padding_idx` | Index of the padding meta-token. ~~int~~ |
+| `type_vocab_size` | Type vocabulary size. ~~int~~ |
+| `mixed_precision` | Use mixed-precision training. ~~bool~~ |
+| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ |
+| **CREATES**                    | The model using the architecture. ~~Model~~                                               |
+
+### spacy-curated-transformers.CamembertTransformer.v1
+
+Construct a CamemBERT transformer model.
+
+| Name | Description |
+| ------------------------------ | ---------------------------------------------------------------------------------------- |
+| `vocab_size` | Vocabulary size. ~~int~~ |
+| `with_spans` | Callback that constructs a span generator model. ~~Callable~~ |
+| `piece_encoder` | The piece encoder to segment input tokens. ~~Model~~ |
+| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~ |
+| `hidden_act` | Activation used by the point-wise feed-forward layers. ~~str~~ |
+| `hidden_dropout_prob` | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~ |
+| `hidden_width` | Width of the final representations. ~~int~~ |
+| `intermediate_width` | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~ |
+| `layer_norm_eps` | Epsilon for layer normalization. ~~float~~ |
+| `max_position_embeddings` | Maximum length of position embeddings. ~~int~~ |
+| `model_max_length` | Maximum length of model inputs. ~~int~~ |
+| `num_attention_heads` | Number of self-attention heads. ~~int~~ |
+| `num_hidden_layers` | Number of hidden layers. ~~int~~ |
+| `padding_idx` | Index of the padding meta-token. ~~int~~ |
+| `type_vocab_size` | Type vocabulary size. ~~int~~ |
+| `mixed_precision` | Use mixed-precision training. ~~bool~~ |
+| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ |
+| **CREATES**                    | The model using the architecture. ~~Model~~                                               |
+
+### spacy-curated-transformers.RobertaTransformer.v1
+
+Construct a RoBERTa transformer model.
+
+| Name | Description |
+| ------------------------------ | ---------------------------------------------------------------------------------------- |
+| `vocab_size` | Vocabulary size. ~~int~~ |
+| `with_spans` | Callback that constructs a span generator model. ~~Callable~~ |
+| `piece_encoder` | The piece encoder to segment input tokens. ~~Model~~ |
+| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~ |
+| `hidden_act` | Activation used by the point-wise feed-forward layers. ~~str~~ |
+| `hidden_dropout_prob` | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~ |
+| `hidden_width` | Width of the final representations. ~~int~~ |
+| `intermediate_width` | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~ |
+| `layer_norm_eps` | Epsilon for layer normalization. ~~float~~ |
+| `max_position_embeddings` | Maximum length of position embeddings. ~~int~~ |
+| `model_max_length` | Maximum length of model inputs. ~~int~~ |
+| `num_attention_heads` | Number of self-attention heads. ~~int~~ |
+| `num_hidden_layers` | Number of hidden layers. ~~int~~ |
+| `padding_idx` | Index of the padding meta-token. ~~int~~ |
+| `type_vocab_size` | Type vocabulary size. ~~int~~ |
+| `mixed_precision` | Use mixed-precision training. ~~bool~~ |
+| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ |
+| **CREATES**                    | The model using the architecture. ~~Model~~                                               |
+
+### spacy-curated-transformers.XlmrTransformer.v1
+
+Construct an XLM-RoBERTa transformer model.
+
+| Name | Description |
+| ------------------------------ | ---------------------------------------------------------------------------------------- |
+| `vocab_size` | Vocabulary size. ~~int~~ |
+| `with_spans` | Callback that constructs a span generator model. ~~Callable~~ |
+| `piece_encoder` | The piece encoder to segment input tokens. ~~Model~~ |
+| `attention_probs_dropout_prob` | Dropout probability of the self-attention layers. ~~float~~ |
+| `hidden_act` | Activation used by the point-wise feed-forward layers. ~~str~~ |
+| `hidden_dropout_prob` | Dropout probability of the point-wise feed-forward and embedding layers. ~~float~~ |
+| `hidden_width` | Width of the final representations. ~~int~~ |
+| `intermediate_width` | Width of the intermediate projection layer in the point-wise feed-forward layer. ~~int~~ |
+| `layer_norm_eps` | Epsilon for layer normalization. ~~float~~ |
+| `max_position_embeddings` | Maximum length of position embeddings. ~~int~~ |
+| `model_max_length` | Maximum length of model inputs. ~~int~~ |
+| `num_attention_heads` | Number of self-attention heads. ~~int~~ |
+| `num_hidden_layers` | Number of hidden layers. ~~int~~ |
+| `padding_idx` | Index of the padding meta-token. ~~int~~ |
+| `type_vocab_size` | Type vocabulary size. ~~int~~ |
+| `mixed_precision` | Use mixed-precision training. ~~bool~~ |
+| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ |
+| **CREATES**                    | The model using the architecture. ~~Model~~                                               |
+
+### spacy-curated-transformers.ScalarWeight.v1
+
+Construct a model that accepts a list of transformer layer outputs and returns
+a weighted representation of those outputs.
+
+| Name | Description |
+| -------------------- | ----------------------------------------------------------------------------- |
+| `num_layers` | Number of transformer hidden layers. ~~int~~ |
+| `dropout_prob` | Dropout probability. ~~float~~ |
+| `mixed_precision` | Use mixed-precision training. ~~bool~~ |
+| `grad_scaler_config` | Configuration passed to the PyTorch gradient scaler. ~~dict~~ |
+| **CREATES**          | The model using the architecture. ~~Model[ScalarWeightInT, ScalarWeightOutT]~~ |
+
+### spacy-curated-transformers.TransformerLayersListener.v1
+
+Construct a listener layer that communicates with one or more upstream
+Transformer components. This layer extracts the outputs of all transformer
+layers and performs pooling over the individual pieces of each `Doc` token,
+returning their corresponding representations. The upstream name should either
+be the wildcard string '\*', or the name of the Transformer component.
+
+In almost all cases, the wildcard string will suffice, as there will usually be
+only one upstream Transformer component. But in certain situations, e.g. when
+you have disjoint datasets for certain tasks, or when you'd like to use a
+pre-trained pipeline but a downstream task requires its own token
+representations, you could end up with more than one Transformer component in
+the pipeline.
+
+| Name | Description |
+| --------------- | ---------------------------------------------------------------------------------------------------------------------- |
+| `layers` | The number of layers produced by the upstream transformer component, excluding the embedding layer. ~~int~~ |
+| `width` | The width of the vectors produced by the upstream transformer component. ~~int~~ |
+| `pooling` | Model that is used to perform pooling over the piece representations. ~~Model~~ |
+| `upstream_name` | A string to identify the 'upstream' Transformer component to communicate with. ~~str~~ |
+| `grad_factor` | Factor to multiply gradients with. ~~float~~ |
+| **CREATES** | A model that returns the relevant vectors from an upstream transformer component. ~~Model[List[Doc], List[Floats2d]]~~ |
+
+### spacy-curated-transformers.LastTransformerLayerListener.v1
+
+Construct a listener layer that communicates with one or more upstream
+Transformer components. This layer extracts the output of the last transformer
+layer and performs pooling over the individual pieces of each `Doc` token,
+returning their corresponding representations. The upstream name should either
+be the wildcard string '\*', or the name of the Transformer component.
+
+In almost all cases, the wildcard string will suffice, as there will usually be
+only one upstream Transformer component. But in certain situations, e.g. when
+you have disjoint datasets for certain tasks, or when you'd like to use a
+pre-trained pipeline but a downstream task requires its own token
+representations, you could end up with more than one Transformer component in
+the pipeline.
+
+| Name | Description |
+| --------------- | ---------------------------------------------------------------------------------------------------------------------- |
+| `width` | The width of the vectors produced by the upstream transformer component. ~~int~~ |
+| `pooling` | Model that is used to perform pooling over the piece representations. ~~Model~~ |
+| `upstream_name` | A string to identify the 'upstream' Transformer component to communicate with. ~~str~~ |
+| `grad_factor` | Factor to multiply gradients with. ~~float~~ |
+| **CREATES** | A model that returns the relevant vectors from an upstream transformer component. ~~Model[List[Doc], List[Floats2d]]~~ |
+
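+As a sketch of how such a listener is wired up (the width and the use of
+`spacy.Tagger.v2` with the `reduce_mean.v1` pooling layer are illustrative), a
+downstream component's `tok2vec` sublayer can be configured like this:
+
+```python
+# Hypothetical config: a tagger listening to a shared curated transformer.
+config = {
+    "model": {
+        "@architectures": "spacy.Tagger.v2",
+        "tok2vec": {
+            "@architectures": "spacy-curated-transformers.LastTransformerLayerListener.v1",
+            "width": 768,
+            "pooling": {"@layers": "reduce_mean.v1"},
+            "upstream_name": "*",
+            "grad_factor": 1.0,
+        },
+    }
+}
+tagger = nlp.add_pipe("tagger", config=config)
+```
+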
+### spacy-curated-transformers.ScalarWeightingListener.v1
+
+Construct a listener layer that communicates with one or more upstream
+Transformer components. This layer calculates a weighted representation of all
+transformer layer outputs and performs pooling over the individual pieces of
+each `Doc` token, returning their corresponding representations.
+
+Requires its upstream Transformer components to return all layer outputs from
+their models. The upstream name should either be the wildcard string '\*', or
+the name of the Transformer component.
+
+In almost all cases, the wildcard string will suffice, as there will usually be
+only one upstream Transformer component. But in certain situations, e.g. when
+you have disjoint datasets for certain tasks, or when you'd like to use a
+pre-trained pipeline but a downstream task requires its own token
+representations, you could end up with more than one Transformer component in
+the pipeline.
+
+| Name | Description |
+| --------------- | ---------------------------------------------------------------------------------------------------------------------- |
+| `width` | The width of the vectors produced by the upstream transformer component. ~~int~~ |
+| `weighting` | Model that is used to perform the weighting of the different layer outputs. ~~Model~~ |
+| `pooling` | Model that is used to perform pooling over the piece representations. ~~Model~~ |
+| `upstream_name` | A string to identify the 'upstream' Transformer component to communicate with. ~~str~~ |
+| `grad_factor` | Factor to multiply gradients with. ~~float~~ |
+| **CREATES** | A model that returns the relevant vectors from an upstream transformer component. ~~Model[List[Doc], List[Floats2d]]~~ |
+
+### spacy-curated-transformers.BertWordpieceEncoder.v1
+
+Construct a WordPiece piece encoder model that accepts a list of token sequences
+or documents and returns a corresponding list of piece identifiers. This encoder
+also splits each token on punctuation characters, as expected by most BERT
+models.
+
+This model must be separately initialized using an appropriate loader.
+
+### spacy-curated-transformers.ByteBpeEncoder.v1
+
+Construct a Byte-BPE piece encoder model that accepts a list of token sequences
+or documents and returns a corresponding list of piece identifiers.
+
+This model must be separately initialized using an appropriate loader.
+
+### spacy-curated-transformers.CamembertSentencepieceEncoder.v1
+
+Construct a SentencePiece piece encoder model that accepts a list of token
+sequences or documents and returns a corresponding list of piece identifiers
+with CamemBERT post-processing applied.
+
+This model must be separately initialized using an appropriate loader.
+
+### spacy-curated-transformers.CharEncoder.v1
+
+Construct a character piece encoder model that accepts a list of token sequences
+or documents and returns a corresponding list of piece identifiers.
+
+This model must be separately initialized using an appropriate loader.
+
+### spacy-curated-transformers.SentencepieceEncoder.v1
+
+Construct a SentencePiece piece encoder model that accepts a list of token
+sequences or documents and returns a corresponding list of piece identifiers.
+
+This model must be separately initialized using an appropriate loader.
+
+### spacy-curated-transformers.WordpieceEncoder.v1
+
+Construct a WordPiece piece encoder model that accepts a list of token sequences
+or documents and returns a corresponding list of piece identifiers. This encoder
+also splits each token on punctuation characters, as expected by most BERT
+models.
+
+This model must be separately initialized using an appropriate loader.
+
+### spacy-curated-transformers.XlmrSentencepieceEncoder.v1
+
+Construct a SentencePiece piece encoder model that accepts a list of token
+sequences or documents and returns a corresponding list of piece identifiers
+with XLM-RoBERTa post-processing applied.
+
+This model must be separately initialized using an appropriate loader.
+
## Pretraining architectures {id="pretrain",source="spacy/ml/models/multi_task.py"}
The spacy `pretrain` command lets you initialize a `Tok2Vec` layer in your
diff --git a/website/docs/api/basevectors.mdx b/website/docs/api/basevectors.mdx
new file mode 100644
index 000000000..993b9a33e
--- /dev/null
+++ b/website/docs/api/basevectors.mdx
@@ -0,0 +1,143 @@
+---
+title: BaseVectors
+teaser: Abstract class for word vectors
+tag: class
+source: spacy/vectors.pyx
+version: 3.7
+---
+
+`BaseVectors` is an abstract class to support the development of custom vectors
+implementations.
+
+For use in training with [`StaticVectors`](/api/architectures#staticvectors),
+`get_batch` must be implemented. For improved performance, use efficient
+batching in `get_batch` and implement `to_ops` to copy the vector data to the
+current device. See an example custom implementation for
+[BPEmb subword embeddings](/usage/embeddings-transformers#custom-vectors).
+
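+The following is a minimal sketch of a custom subclass (assuming `BaseVectors`
+is importable from `spacy.vectors` and that `__init__` takes the keyword-only
+`strings` argument documented below; the hash-based vectors are purely
+illustrative):
+
+```python
+import numpy
+from spacy.strings import get_string_id
+from spacy.vectors import BaseVectors
+
+
+class HashVectors(BaseVectors):
+    """Toy vectors: every key maps to a stable pseudo-random row."""
+
+    def __init__(self, *, strings=None, dims=64):
+        # Assumption: BaseVectors.__init__ accepts the keyword-only
+        # `strings` argument documented below.
+        super().__init__(strings=strings)
+        self.dims = dims
+
+    def _row(self, key):
+        # Seed a generator with the key's hash for a stable vector.
+        key = get_string_id(key) if isinstance(key, str) else key
+        rng = numpy.random.default_rng(key)
+        return rng.standard_normal(self.dims).astype("float32")
+
+    def __getitem__(self, key):
+        return self._row(key)
+
+    def __contains__(self, key):
+        return True  # every key gets a (hashed) vector
+
+    def __len__(self):
+        return 0  # no rows are actually stored
+
+    def add(self, key):
+        return -1  # adding keys is not supported
+
+    @property
+    def is_full(self):
+        return True
+
+    @property
+    def shape(self):
+        return (0, self.dims)
+
+    def get_batch(self, keys):
+        # Batched lookup, required for training with StaticVectors.
+        return numpy.stack([self._row(key) for key in keys])
+```
+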
+## BaseVectors.\_\_init\_\_ {id="init",tag="method"}
+
+Create a new vector store.
+
+| Name | Description |
+| -------------- | --------------------------------------------------------------------------------------------------------------------- |
+| _keyword-only_ | |
+| `strings` | The string store. A new string store is created if one is not provided. Defaults to `None`. ~~Optional[StringStore]~~ |
+
+## BaseVectors.\_\_getitem\_\_ {id="getitem",tag="method"}
+
+Get a vector by key. If the key is not found in the table, a `KeyError` should
+be raised.
+
+| Name | Description |
+| ----------- | ---------------------------------------------------------------- |
+| `key` | The key to get the vector for. ~~Union[int, str]~~ |
+| **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
+
+## BaseVectors.\_\_len\_\_ {id="len",tag="method"}
+
+Return the number of vectors in the table.
+
+| Name | Description |
+| ----------- | ------------------------------------------- |
+| **RETURNS** | The number of vectors in the table. ~~int~~ |
+
+## BaseVectors.\_\_contains\_\_ {id="contains",tag="method"}
+
+Check whether there is a vector entry for the given key.
+
+| Name | Description |
+| ----------- | -------------------------------------------- |
+| `key` | The key to check. ~~int~~ |
+| **RETURNS** | Whether the key has a vector entry. ~~bool~~ |
+
+## BaseVectors.add {id="add",tag="method"}
+
+Add a key to the table, if possible. If no keys can be added, return `-1`.
+
+| Name | Description |
+| ----------- | ----------------------------------------------------------------------------------- |
+| `key` | The key to add. ~~Union[str, int]~~ |
+| **RETURNS** | The row the vector was added to, or `-1` if the operation is not supported. ~~int~~ |
+
+## BaseVectors.shape {id="shape",tag="property"}
+
+Get a `(rows, dims)` tuple with the number of rows and the number of dimensions
+in the vector table.
+
+| Name | Description |
+| ----------- | ------------------------------------------ |
+| **RETURNS** | A `(rows, dims)` pair. ~~Tuple[int, int]~~ |
+
+## BaseVectors.size {id="size",tag="property"}
+
+The vector size, i.e. `rows * dims`.
+
+| Name | Description |
+| ----------- | ------------------------ |
+| **RETURNS** | The vector size. ~~int~~ |
+
+## BaseVectors.is_full {id="is_full",tag="property"}
+
+Whether the vectors table is full and no slots are available for new keys.
+
+| Name | Description |
+| ----------- | ------------------------------------------- |
+| **RETURNS** | Whether the vectors table is full. ~~bool~~ |
+
+## BaseVectors.get_batch {id="get_batch",tag="method",version="3.2"}
+
+Get the vectors for the provided keys efficiently as a batch. Required to use
+the vectors with [`StaticVectors`](/api/architectures#StaticVectors) for
+training.
+
+| Name | Description |
+| ------ | --------------------------------------- |
+| `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
+
+## BaseVectors.to_ops {id="to_ops",tag="method"}
+
+Dummy method. Implement this to change the embedding matrix to use different
+Thinc ops.
+
+| Name | Description |
+| ----- | -------------------------------------------------------- |
+| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |
+
+## BaseVectors.to_disk {id="to_disk",tag="method"}
+
+Dummy method to allow serialization. Implement to save vector data with the
+pipeline.
+
+| Name | Description |
+| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+
+## BaseVectors.from_disk {id="from_disk",tag="method"}
+
+Dummy method to allow serialization. Implement to load vector data from a saved
+pipeline.
+
+| Name | Description |
+| ----------- | ----------------------------------------------------------------------------------------------- |
+| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| **RETURNS** | The modified vectors object. ~~BaseVectors~~ |
+
+## BaseVectors.to_bytes {id="to_bytes",tag="method"}
+
+Dummy method to allow serialization. Implement to serialize vector data to a
+binary string.
+
+| Name | Description |
+| ----------- | ---------------------------------------------------- |
+| **RETURNS** | The serialized form of the vectors object. ~~bytes~~ |
+
+## BaseVectors.from_bytes {id="from_bytes",tag="method"}
+
+Dummy method to allow serialization. Implement to load vector data from a binary
+string.
+
+| Name | Description |
+| ----------- | ----------------------------------- |
+| `data` | The data to load from. ~~bytes~~ |
+| **RETURNS** | The vectors object. ~~BaseVectors~~ |
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 6a87f78b8..3ec0081c9 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -7,6 +7,7 @@ menu:
- ['info', 'info']
- ['validate', 'validate']
- ['init', 'init']
+ - ['find-function', 'find-function']
- ['convert', 'convert']
- ['debug', 'debug']
- ['train', 'train']
@@ -185,6 +186,29 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff]
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Complete and auto-filled config file for training. |
+### init fill-curated-transformer {id="init-fill-curated-transformer",version="3.7",tag="command"}
+
+Auto-fill the Hugging Face model hyperparameters and loader parameters of a
+[Curated Transformer](/api/curatedtransformer) pipeline component in a
+[.cfg file](/usage/training#config). The name and revision of the
+[Hugging Face model](https://huggingface.co/models) can either be passed as
+command-line arguments or read from the
+`initialize.components.transformer.encoder_loader` config section.
+
+```bash
+$ python -m spacy init fill-curated-transformer [base_path] [output_file] [--model-name] [--model-revision] [--pipe-name] [--code]
+```
+
+| Name | Description |
+| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ |
+| `output_file` | Path to output `.cfg` file or "-" to write to stdout so you can pipe it to a file. Defaults to "-" (stdout). ~~Path (positional)~~ |
+| `--model-name`, `-m` | Name of the Hugging Face model. Defaults to the model name from the encoder loader config. ~~Optional[str] (option)~~ |
+| `--model-revision`, `-r` | Revision of the Hugging Face model. Defaults to `main`. ~~Optional[str] (option)~~ |
+| `--pipe-name`, `-n` | Name of the Curated Transformer pipe whose config is to be filled. Defaults to the first transformer pipe. ~~Optional[str] (option)~~ |
+| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| **CREATES** | Complete and auto-filled config file for training. |
+
### init vectors {id="init-vectors",version="3",tag="command"}
Convert [word vectors](/usage/linguistic-features#vectors-similarity) for use
@@ -251,6 +275,27 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The label files. |
+## find-function {id="find-function",version="3.7",tag="command"}
+
+Find the module, path and line number of the file where a given registered
+function is defined. This is helpful for understanding where the registered
+functions used in your config file come from.
+
+```bash
+$ python -m spacy find-function [func_name] [--registry]
+```
+
+> #### Example
+>
+> ```bash
+> $ python -m spacy find-function spacy.TextCatBOW.v1
+> ```
+
+| Name | Description |
+| ------------------ | ----------------------------------------------------- |
+| `func_name` | Name of the registered function. ~~str (positional)~~ |
+| `--registry`, `-r` | Name of the catalogue registry. ~~str (option)~~ |
+
## convert {id="convert",tag="command"}
Convert files into spaCy's
@@ -1019,6 +1064,42 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Debugging information. |
+### debug pieces {id="debug-pieces",version="3.7",tag="command"}
+
+Analyze word- or sentencepiece statistics.
+
+```bash
+$ python -m spacy debug pieces [config_path] [--code] [--name] [overrides]
+```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path` | Path to config file. ~~Union[Path, str] (positional)~~ |
+| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--name`, `-n` | Name of the Curated Transformer pipe whose piece statistics should be analyzed. Defaults to the first transformer pipe. ~~Optional[str] (option)~~                                           |
+| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
+| **PRINTS** | Debugging information. |
+
+```bash
+$ python -m spacy debug pieces ./config.cfg
+```
+
+```
+========================= Training corpus statistics =========================
+Median token length: 1.0
+Mean token length: 1.54
+Token length range: [1, 13]
+
+======================= Development corpus statistics =======================
+Median token length: 1.0
+Mean token length: 1.44
+Token length range: [1, 8]
+```
+
## train {id="train",tag="command"}
Train a pipeline. Expects data in spaCy's
@@ -1161,7 +1242,7 @@ skew. To render a sample of dependency parses in a HTML file using the
`--displacy-path` argument.
```bash
-$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
+$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] [--per-component] [--spans-key]
```
| Name | Description |
@@ -1175,6 +1256,7 @@ $ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--
| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ |
| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ |
| `--per-component`, `-P` 3.6 | Whether to return the scores keyed by component name. Defaults to `False`. ~~bool (flag)~~ |
+| `--spans-key`, `-sk` 3.6.2 | Spans key to use when evaluating `Doc.spans`. Defaults to `sc`. ~~str (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Training results and optional metrics and visualizations. |
@@ -1651,10 +1733,10 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose]
> $ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl
> ```
-| Name | Description |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
-| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ |
-| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ |
-| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ |
-| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ |
-| **UPLOADS** | The pipeline to the hub. |
+| Name | Description |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------- |
+| `whl_path`        | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path (positional)~~ |
+| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ |
+| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ |
+| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ |
+| **UPLOADS** | The pipeline to the hub. |
diff --git a/website/docs/api/curatedtransformer.mdx b/website/docs/api/curatedtransformer.mdx
new file mode 100644
index 000000000..5fdbd86cb
--- /dev/null
+++ b/website/docs/api/curatedtransformer.mdx
@@ -0,0 +1,572 @@
+---
+title: CuratedTransformer
+teaser:
+ Pipeline component for multi-task learning with Curated Transformer models
+tag: class
+source: github.com/explosion/spacy-curated-transformers/blob/main/spacy_curated_transformers/pipeline/transformer.py
+version: 3.7
+api_base_class: /api/pipe
+api_string_name: curated_transformer
+---
+
+This component is available via the extension package
+[`spacy-curated-transformers`](https://github.com/explosion/spacy-curated-transformers).
+It exposes the component via entry points, so if you have the package installed,
+using `factory = "curated_transformer"` in your
+[training config](/usage/training#config) will work out-of-the-box.
+
+This pipeline component lets you use a curated set of transformer models in your
+pipeline. spaCy Curated Transformers currently supports the following model
+types:
+
+- ALBERT
+- BERT
+- CamemBERT
+- RoBERTa
+- XLM-RoBERTa
+
+If you want to use another type of model, use
+[spacy-transformers](/api/spacy-transformers), which allows you to use all
+Hugging Face transformer models with spaCy.
+
+You will usually connect downstream components to a shared Curated Transformer
+pipe using one of the Curated Transformer listener layers. This works similarly
+to spaCy's [Tok2Vec](/api/tok2vec) component and its
+[Tok2VecListener](/api/architectures/#Tok2VecListener) sublayer. The component
+assigns the output of the transformer to the `Doc`'s extension attributes. To
+access the values, you can use the custom
+[`Doc._.trf_data`](#assigned-attributes) attribute.
+
+For more details, see the [usage documentation](/usage/embeddings-transformers).
+
+## Assigned Attributes {id="assigned-attributes"}
+
+The component sets the following
+[custom extension attribute](/usage/processing-pipeline#custom-components-attributes):
+
+| Location | Value |
+| ---------------- | -------------------------------------------------------------------------- |
+| `Doc._.trf_data` | Curated Transformer outputs for the `Doc` object. ~~DocTransformerOutput~~ |
+
+## Config and Implementation {id="config"}
+
+The default config is defined by the pipeline component factory and describes
+how the component should be configured. You can override its settings via the
+`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
+[`config.cfg` for training](/usage/training#config). See the
+[model architectures](/api/architectures#curated-trf) documentation for details
+on the curated transformer architectures and their arguments and
+hyperparameters.
+
+> #### Example
+>
+> ```python
+> from spacy_curated_transformers.pipeline.transformer import DEFAULT_CONFIG
+>
+> nlp.add_pipe("curated_transformer", config=DEFAULT_CONFIG)
+> ```
+
+| Setting | Description |
+| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [`XlmrTransformer`](/api/architectures#curated-trf). ~~Model~~ |
+| `frozen` | If `True`, the model's weights are frozen and no backpropagation is performed. ~~bool~~ |
+| `all_layer_outputs` | If `True`, the model returns the outputs of all the layers. Otherwise, only the output of the last layer is returned. This must be set to `True` if any of the pipe's downstream listeners require the outputs of all transformer layers. ~~bool~~ |
+
+```python
+https://github.com/explosion/spacy-curated-transformers/blob/main/spacy_curated_transformers/pipeline/transformer.py
+```
+
+## CuratedTransformer.\_\_init\_\_ {id="init",tag="method"}
+
+> #### Example
+>
+> ```python
+> # Construction via add_pipe with default model
+> trf = nlp.add_pipe("curated_transformer")
+>
+> # Construction via add_pipe with custom config
+> config = {
+> "model": {
+> "@architectures": "spacy-curated-transformers.XlmrTransformer.v1",
+> "vocab_size": 250002,
+> "num_hidden_layers": 12,
+> "hidden_width": 768,
+> "piece_encoder": {
+> "@architectures": "spacy-curated-transformers.XlmrSentencepieceEncoder.v1"
+> }
+> }
+> }
+> trf = nlp.add_pipe("curated_transformer", config=config)
+>
+> # Construction from class
+> from spacy_curated_transformers import CuratedTransformer
+> trf = CuratedTransformer(nlp.vocab, model)
+> ```
+
+Construct a `CuratedTransformer` component. One or more subsequent spaCy
+components can use the transformer outputs as features in their models, with
+gradients backpropagated to the shared weights. The activations from the
+transformer are saved in the [`Doc._.trf_data`](#assigned-attributes) extension
+attribute. You can also provide a callback to set additional annotations. In
+your application, you would normally use a shortcut for this and instantiate the
+component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).
+
+| Name | Description |
+| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | One of the supported pre-trained transformer models. ~~Model~~ |
+| _keyword-only_ | |
+| `name` | The component instance name. ~~str~~ |
+| `frozen` | If `True`, the model's weights are frozen and no backpropagation is performed. ~~bool~~ |
+| `all_layer_outputs` | If `True`, the model returns the outputs of all the layers. Otherwise, only the output of the last layer is returned. This must be set to `True` if any of the pipe's downstream listeners require the outputs of all transformer layers. ~~bool~~ |
+
+## CuratedTransformer.\_\_call\_\_ {id="call",tag="method"}
+
+Apply the pipe to one document. The document is modified in place, and returned.
+This usually happens under the hood when the `nlp` object is called on a text
+and all pipeline components are applied to the `Doc` in order. Both
+[`__call__`](/api/curatedtransformer#call) and
+[`pipe`](/api/curatedtransformer#pipe) delegate to the
+[`predict`](/api/curatedtransformer#predict) and
+[`set_annotations`](/api/curatedtransformer#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> doc = nlp("This is a sentence.")
+> trf = nlp.add_pipe("curated_transformer")
+> # This usually happens under the hood
+> processed = trf(doc)
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------- |
+| `doc` | The document to process. ~~Doc~~ |
+| **RETURNS** | The processed document. ~~Doc~~ |
+
+## CuratedTransformer.pipe {id="pipe",tag="method"}
+
+Apply the pipe to a stream of documents. This usually happens under the hood
+when the `nlp` object is called on a text and all pipeline components are
+applied to the `Doc` in order. Both [`__call__`](/api/curatedtransformer#call)
+and [`pipe`](/api/curatedtransformer#pipe) delegate to the
+[`predict`](/api/curatedtransformer#predict) and
+[`set_annotations`](/api/curatedtransformer#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("curated_transformer")
+> for doc in trf.pipe(docs, batch_size=50):
+> pass
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------- |
+| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
+| _keyword-only_ | |
+| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
+| **YIELDS** | The processed documents in order. ~~Doc~~ |
+
+## CuratedTransformer.initialize {id="initialize",tag="method"}
+
+Initialize the component for training and return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
+function that returns an iterable of [`Example`](/api/example) objects. **At
+least one example should be supplied.** The data examples are used to
+**initialize the model** of the component and can either be the full training
+data or a representative sample. Initialization includes validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("curated_transformer")
+> trf.initialize(lambda: examples, nlp=nlp)
+> ```
+
+| Name | Description |
+| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ | |
+| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `encoder_loader` | Initialization callback for the transformer model. ~~Optional[Callable]~~ |
+| `piece_loader` | Initialization callback for the input piece encoder. ~~Optional[Callable]~~ |
+
+## CuratedTransformer.predict {id="predict",tag="method"}
+
+Apply the component's model to a batch of [`Doc`](/api/doc) objects without
+modifying them.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("curated_transformer")
+> scores = trf.predict([doc1, doc2])
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------- |
+| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
+| **RETURNS** | The model's prediction for each document. |
+
+## CuratedTransformer.set_annotations {id="set_annotations",tag="method"}
+
+Assign the extracted features to the `Doc` objects. By default, the
+[`DocTransformerOutput`](/api/curatedtransformer#doctransformeroutput) object is
+written to the [`Doc._.trf_data`](#assigned-attributes) attribute. Your
+`set_extra_annotations` callback is then called, if provided.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("curated_transformer")
+> scores = trf.predict(docs)
+> trf.set_annotations(docs, scores)
+> ```
+
+| Name | Description |
+| -------- | ------------------------------------------------------------ |
+| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
+| `scores` | The scores to set, produced by `CuratedTransformer.predict`. |
+
+## CuratedTransformer.update {id="update",tag="method"}
+
+Prepare for an update to the transformer.
+
+Like the [`Tok2Vec`](/api/tok2vec) component, the `CuratedTransformer` component
+is unusual in that it does not receive "gold standard" annotations to calculate
+a weight update. The optimal output of the transformer data is unknown; it's a
+hidden layer inside the network that is updated by backpropagating from output
+layers.
+
+The `CuratedTransformer` component therefore does not perform a weight update
+during its own `update` method. Instead, it runs its transformer model and
+communicates the output and the backpropagation callback to any downstream
+components that have been connected to it via the transformer listener sublayer.
+If there are multiple listeners, the last layer will actually backprop to the
+transformer and call the optimizer, while the others simply increment the
+gradients.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("curated_transformer")
+> optimizer = nlp.initialize()
+> losses = trf.update(examples, sgd=optimizer)
+> ```
+
+| Name | Description |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop` | The dropout rate. ~~float~~ |
+| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+## CuratedTransformer.create_optimizer {id="create_optimizer",tag="method"}
+
+Create an optimizer for the pipeline component.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("curated_transformer")
+> optimizer = trf.create_optimizer()
+> ```
+
+| Name | Description |
+| ----------- | ---------------------------- |
+| **RETURNS** | The optimizer. ~~Optimizer~~ |
+
+## CuratedTransformer.use_params {id="use_params",tag="method, contextmanager"}
+
+Modify the pipe's model to use the given parameter values. At the end of the
+context, the original parameters are restored.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("curated_transformer")
+> with trf.use_params(optimizer.averages):
+> trf.to_disk("/best_model")
+> ```
+
+| Name | Description |
+| -------- | -------------------------------------------------- |
+| `params` | The parameter values to use in the model. ~~dict~~ |
+
+## CuratedTransformer.to_disk {id="to_disk",tag="method"}
+
+Serialize the pipe to disk.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("curated_transformer")
+> trf.to_disk("/path/to/transformer")
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+
+## CuratedTransformer.from_disk {id="from_disk",tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("curated_transformer")
+> trf.from_disk("/path/to/transformer")
+> ```
+
+| Name | Description |
+| -------------- | ----------------------------------------------------------------------------------------------- |
+| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The modified `CuratedTransformer` object. ~~CuratedTransformer~~ |
+
+## CuratedTransformer.to_bytes {id="to_bytes",tag="method"}
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("curated_transformer")
+> trf_bytes = trf.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The serialized form of the `CuratedTransformer` object. ~~bytes~~ |
+
+## CuratedTransformer.from_bytes {id="from_bytes",tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> trf_bytes = trf.to_bytes()
+> trf = nlp.add_pipe("curated_transformer")
+> trf.from_bytes(trf_bytes)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| `bytes_data` | The data to load from. ~~bytes~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The `CuratedTransformer` object. ~~CuratedTransformer~~ |
+
+## Serialization Fields {id="serialization-fields"}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> trf.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| ------- | -------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `cfg` | The config file. You usually don't want to exclude this. |
+| `model` | The binary model data. You usually don't want to exclude this. |
+
+## DocTransformerOutput {id="doctransformeroutput",tag="dataclass"}
+
+Curated Transformer outputs for one `Doc` object. Stores the dense
+representations generated by the transformer for each piece identifier. Piece
+identifiers are grouped by token. Instances of this class are typically assigned
+to the [`Doc._.trf_data`](/api/curatedtransformer#assigned-attributes) extension
+attribute.
+
+| Name | Description |
+| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `all_outputs`     | List of `Ragged` tensors that correspond to the outputs of the different transformer layers. Each tensor element corresponds to a piece identifier's representation. ~~List[Ragged]~~ |
+| `last_layer_only` | Whether only the last transformer layer's outputs are preserved. ~~bool~~                                                                                                             |
+
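+For illustration, the stored output can be inspected like this (assuming a
+pipeline that contains a `curated_transformer` component):
+
+```python
+doc = nlp("This is a sentence.")
+output = doc._.trf_data
+last = output.last_hidden_layer_state  # thinc Ragged: one row per piece
+print(output.num_outputs, last.data.shape)
+```
+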
+### DocTransformerOutput.embedding_layer {id="doctransformeroutput-embeddinglayer",tag="property"}
+
+Return the output of the transformer's embedding layer or `None` if
+`last_layer_only` is `True`.
+
+| Name | Description |
+| ----------- | -------------------------------------------- |
+| **RETURNS** | Embedding layer output. ~~Optional[Ragged]~~ |
+
+### DocTransformerOutput.last_hidden_layer_state {id="doctransformeroutput-lasthiddenlayerstate",tag="property"}
+
+Return the output of the transformer's last hidden layer.
+
+| Name | Description |
+| ----------- | ------------------------------------ |
+| **RETURNS** | Last hidden layer output. ~~Ragged~~ |
+
+### DocTransformerOutput.all_hidden_layer_states {id="doctransformeroutput-allhiddenlayerstates",tag="property"}
+
+Return the outputs of all transformer layers (excluding the embedding layer).
+
+| Name | Description |
+| ----------- | -------------------------------------- |
+| **RETURNS** | Hidden layer outputs. ~~List[Ragged]~~ |
+
+### DocTransformerOutput.num_outputs {id="doctransformeroutput-numoutputs",tag="property"}
+
+Return the number of layer outputs stored in the `DocTransformerOutput` instance
+(including the embedding layer).
+
+| Name | Description |
+| ----------- | -------------------------- |
+| **RETURNS** | Number of outputs. ~~int~~ |
+
+## Span Getters {id="span_getters",source="github.com/explosion/spacy-curated-transformers/blob/main/spacy_curated_transformers/span_getters.py"}
+
+Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
+return a list of [`Span`](/api/span) objects for each doc to be processed by
+the transformer. This is used to manage long documents by cutting them into
+smaller sequences before running the transformer. The spans are allowed to
+overlap, and you can also omit sections of the `Doc` if they are not relevant.
+Span getters can be referenced in the
+`[components.transformer.model.with_spans]` block of the config to customize the
+sequences processed by the transformer.
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------- |
+| `docs` | A batch of `Doc` objects. ~~Iterable[Doc]~~ |
+| **RETURNS** | The spans to process by the transformer. ~~List[List[Span]]~~ |
+
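+A custom span getter is simply a callable with this signature. As a minimal
+sketch (assuming sentence boundaries have been set, e.g. by a `senter` or
+`parser` component):
+
+```python
+from typing import Iterable, List
+
+from spacy.tokens import Doc, Span
+
+
+def sentence_spans(docs: Iterable[Doc]) -> List[List[Span]]:
+    # One span per sentence, so the transformer never crosses
+    # a sentence boundary.
+    return [list(doc.sents) for doc in docs]
+```
+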
+### WithStridedSpans.v1 {id="strided_spans",tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [transformer.model.with_spans]
+> @architectures = "spacy-curated-transformers.WithStridedSpans.v1"
+> stride = 96
+> window = 128
+> ```
+
+Create a span getter for strided spans. If you set the `window` and `stride` to
+the same value, the spans will cover each token once. Setting `stride` lower
+than `window` will allow for an overlap, so that some tokens are counted twice.
+This can be desirable, because it allows all tokens to have both a left and
+right context.
+
+| Name | Description |
+| -------- | ------------------------ |
+| `window` | The window size. ~~int~~ |
+| `stride` | The stride size. ~~int~~ |
+
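+To illustrate the window/stride arithmetic in plain Python (a sketch, not the
+registered implementation):
+
+```python
+def strided_offsets(n_tokens, window=128, stride=96):
+    # Each window starts `stride` tokens after the previous one, so
+    # consecutive windows overlap by `window - stride` tokens.
+    for start in range(0, n_tokens, stride):
+        yield start, min(start + window, n_tokens)
+
+
+print(list(strided_offsets(200)))
+# [(0, 128), (96, 200), (192, 200)]
+```
+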
+## Model Loaders
+
+[Curated Transformer models](/api/architectures#curated-trf) are constructed
+with default hyperparameters and randomized weights when the pipeline is
+created. To load the weights of an existing pre-trained model into the pipeline,
+one of the following loader callbacks can be used. The pre-trained model must
+have the same hyperparameters as the model used by the pipeline.
+
+### HFTransformerEncoderLoader.v1 {id="hf_trfencoder_loader",tag="registered_function"}
+
+Construct a callback that initializes a supported transformer model with weights
+from a corresponding HuggingFace model.
+
+| Name | Description |
+| ---------- | ------------------------------------------ |
+| `name` | Name of the HuggingFace model. ~~str~~ |
+| `revision` | Name of the model revision/branch. ~~str~~ |
+
+### PyTorchCheckpointLoader.v1 {id="pytorch_checkpoint_loader",tag="registered_function"}
+
+Construct a callback that initializes a supported transformer model with weights
+from a PyTorch checkpoint.
+
+| Name | Description |
+| ------ | ---------------------------------------- |
+| `path` | Path to the PyTorch checkpoint. ~~Path~~ |
+
+## Tokenizer Loaders
+
+[Curated Transformer models](/api/architectures#curated-trf) must be paired with
+a matching tokenizer (piece encoder) model in a spaCy pipeline. As with the
+transformer models, tokenizers are constructed with an empty vocabulary during
+pipeline creation; they need to be initialized with an appropriate loader
+before use in training or inference.
+
+### ByteBPELoader.v1 {id="bytebpe_loader",tag="registered_function"}
+
+Construct a callback that initializes a Byte-BPE piece encoder model.
+
+| Name | Description |
+| ------------- | ------------------------------------- |
+| `vocab_path` | Path to the vocabulary file. ~~Path~~ |
+| `merges_path` | Path to the merges file. ~~Path~~ |
+
+### CharEncoderLoader.v1 {id="charencoder_loader",tag="registered_function"}
+
+Construct a callback that initializes a character piece encoder model.
+
+| Name | Description |
+| ----------- | --------------------------------------------------------------------------- |
+| `path` | Path to the serialized character model. ~~Path~~ |
+| `bos_piece` | Piece used as a beginning-of-sentence token. Defaults to `"[BOS]"`. ~~str~~ |
+| `eos_piece` | Piece used as an end-of-sentence token. Defaults to `"[EOS]"`. ~~str~~      |
+| `unk_piece` | Piece used as a stand-in for unknown tokens. Defaults to `"[UNK]"`. ~~str~~ |
+| `normalize` | Unicode normalization form to use. Defaults to `"NFKC"`. ~~str~~ |
+
+### HFPieceEncoderLoader.v1 {id="hf_pieceencoder_loader",tag="registered_function"}
+
+Construct a callback that initializes a HuggingFace piece encoder model. Used in
+conjunction with the HuggingFace model loader.
+
+| Name | Description |
+| ---------- | ------------------------------------------ |
+| `name` | Name of the HuggingFace model. ~~str~~ |
+| `revision` | Name of the model revision/branch. ~~str~~ |
+
+### SentencepieceLoader.v1 {id="sentencepiece_loader",tag="registered_function"}
+
+Construct a callback that initializes a SentencePiece piece encoder model.
+
+| Name | Description |
+| ------ | ---------------------------------------------------- |
+| `path` | Path to the serialized SentencePiece model. ~~Path~~ |
+
+### WordpieceLoader.v1 {id="wordpiece_loader",tag="registered_function"}
+
+Construct a callback that initializes a WordPiece piece encoder model.
+
+| Name | Description |
+| ------ | ------------------------------------------------ |
+| `path` | Path to the serialized WordPiece model. ~~Path~~ |
+
+## Callbacks
+
+### gradual_transformer_unfreezing.v1 {id="gradual_transformer_unfreezing",tag="registered_function"}
+
+Construct a callback that can be used to gradually unfreeze the weights of one
+or more Transformer components during training. This helps to prevent
+catastrophic forgetting during fine-tuning.
+
+| Name | Description |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `target_pipes` | A dictionary whose keys and values correspond to the names of Transformer components and the training step at which they should be unfrozen respectively. ~~Dict[str, int]~~ |
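+
+For example (the component name is illustrative), a `target_pipes` value of
+`{"transformer": 6000}` keeps the weights of the `transformer` component frozen
+until training step 6000, after which they are updated as usual.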
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index de23156b9..068e8ea78 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -856,7 +856,7 @@ token-to-vector embedding component like [`Tok2Vec`](/api/tok2vec) or
training a pipeline with components sourced from an existing pipeline: if
multiple components (e.g. tagger, parser, NER) listen to the same
token-to-vector component, but some of them are frozen and not updated, their
-performance may degrade significally as the token-to-vector component is updated
+performance may degrade significantly as the token-to-vector component is updated
with new data. To prevent this, listeners can be replaced with a standalone
token-to-vector layer that is owned by the component and doesn't change if the
component isn't updated.
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 3b4633f02..9cdc0c8ab 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -343,6 +343,130 @@ use with the `manual=True` argument in `displacy.render`.
| `options` | Span-specific visualisation options. ~~Dict[str, Any]~~ |
| **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ |
+### Visualizer data structures {id="displacy_structures"}
+
+You can use displaCy's data format to manually render data. This can be useful
+if you want to visualize output from other libraries. You can find examples of
+displaCy's different data formats below.
+
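+For example, a dictionary in the dependency format described below can be
+rendered directly with `displacy.render` by setting `manual=True`:
+
+```python
+from spacy import displacy
+
+dep_data = {
+    "words": [{"text": "This", "tag": "DT"}, {"text": "is", "tag": "VBZ"}],
+    "arcs": [{"start": 0, "end": 1, "label": "nsubj", "dir": "left"}],
+}
+html = displacy.render(dep_data, style="dep", manual=True)
+```
+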
+> #### DEP example data structure
+>
+> ```json
+> {
+> "words": [
+> { "text": "This", "tag": "DT" },
+> { "text": "is", "tag": "VBZ" },
+> { "text": "a", "tag": "DT" },
+> { "text": "sentence", "tag": "NN" }
+> ],
+> "arcs": [
+> { "start": 0, "end": 1, "label": "nsubj", "dir": "left" },
+> { "start": 2, "end": 3, "label": "det", "dir": "left" },
+> { "start": 1, "end": 3, "label": "attr", "dir": "right" }
+> ]
+> }
+> ```
+
+#### Dependency Visualizer data structure {id="structure-dep"}
+
+| Dictionary Key | Description |
+| -------------- | ----------------------------------------------------------------------------------------------------------- |
+| `words` | List of dictionaries describing a word token (see structure below). ~~List[Dict[str, Any]]~~ |
+| `arcs` | List of dictionaries describing the relations between words (see structure below). ~~List[Dict[str, Any]]~~ |
+| _Optional_ | |
+| `title` | Title of the visualization. ~~Optional[str]~~ |
+| `settings` | Dependency Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
+
+| Dictionary Key | Description |
+| -------------- | ---------------------------------------- |
+| `text` | Text content of the word. ~~str~~ |
+| `tag` | Fine-grained part-of-speech. ~~str~~ |
+| `lemma` | Base form of the word. ~~Optional[str]~~ |
+
+| Dictionary Key | Description |
+| -------------- | ---------------------------------------------------- |
+| `start` | The index of the starting token. ~~int~~ |
+| `end` | The index of the ending token. ~~int~~ |
+| `label` | The type of dependency relation. ~~str~~ |
+| `dir` | Direction of the relation (`left`, `right`). ~~str~~ |
+
+
+> #### ENT example data structure
+>
+> ```json
+> {
+> "text": "But Google is starting from behind.",
+> "ents": [{ "start": 4, "end": 10, "label": "ORG" }]
+> }
+> ```
+
+#### Named Entity Recognition data structure {id="structure-ent"}
+
+| Dictionary Key | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| `text` | String representation of the document text. ~~str~~ |
+| `ents` | List of dictionaries describing entities (see structure below). ~~List[Dict[str, Any]]~~ |
+| _Optional_ | |
+| `title` | Title of the visualization. ~~Optional[str]~~ |
+| `settings` | Entity Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
+
+
+
+| Dictionary Key | Description |
+| -------------- | ---------------------------------------------------------------------- |
+| `start` | The index of the first character of the entity. ~~int~~ |
+| `end`          | The index of the character after the last character of the entity (exclusive). ~~int~~ |
+| `label` | Label attached to the entity. ~~str~~ |
+| _Optional_ | |
+| `kb_id` | `KnowledgeBase` ID. ~~str~~ |
+| `kb_url` | `KnowledgeBase` URL. ~~str~~ |
+
+
+
+> #### SPAN example data structure
+>
+> ```json
+> {
+> "text": "Welcome to the Bank of China.",
+> "spans": [
+> { "start_token": 3, "end_token": 6, "label": "ORG" },
+> { "start_token": 5, "end_token": 6, "label": "GPE" }
+> ],
+> "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."]
+> }
+> ```
+
+#### Span Classification data structure {id="structure-span"}
+
+| Dictionary Key | Description |
+| -------------- | ----------------------------------------------------------------------------------------- |
+| `text` | String representation of the document text. ~~str~~ |
+| `spans` | List of dictionaries describing spans (see structure below). ~~List[Dict[str, Any]]~~ |
+| `tokens` | List of word tokens. ~~List[str]~~ |
+| _Optional_ | |
+| `title` | Title of the visualization. ~~Optional[str]~~ |
+| `settings` | Span Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
+
+
+
+| Dictionary Key | Description |
+| -------------- | ------------------------------------------------------------- |
+| `start_token` | The index of the first token of the span in `tokens`. ~~int~~ |
+| `end_token`   | The index of the token after the last token of the span in `tokens` (exclusive). ~~int~~ |
+| `label` | Label attached to the span. ~~str~~ |
+| _Optional_ | |
+| `kb_id` | `KnowledgeBase` ID. ~~str~~ |
+| `kb_url` | `KnowledgeBase` URL. ~~str~~ |
+
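+To render any of these structures manually, pass the dictionary to
+[`displacy.render`](/api/top-level#displacy.render) or `displacy.serve` with
+`manual=True`. A minimal sketch using the SPAN example above (the variable name
+`span_data` is only illustrative):
+
+```python
+from spacy import displacy
+
+# Data copied from the SPAN example data structure above
+span_data = {
+    "text": "Welcome to the Bank of China.",
+    "spans": [
+        {"start_token": 3, "end_token": 6, "label": "ORG"},
+        {"start_token": 5, "end_token": 6, "label": "GPE"},
+    ],
+    "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
+}
+# manual=True tells displaCy to expect a dict instead of a Doc object
+html = displacy.render(span_data, style="span", manual=True)
+```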
+
+
### Visualizer options {id="displacy_options"}
The `options` argument lets you specify additional settings for each visualizer.
diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx
index fa4cd0c7a..0e92eb12b 100644
--- a/website/docs/api/vectors.mdx
+++ b/website/docs/api/vectors.mdx
@@ -297,10 +297,9 @@ The vector size, i.e. `rows * dims`.
## Vectors.is_full {id="is_full",tag="property"}
-Whether the vectors table is full and has no slots are available for new keys.
-If a table is full, it can be resized using
-[`Vectors.resize`](/api/vectors#resize). In `floret` mode, the table is always
-full and cannot be resized.
+Whether the vectors table is full and no slots are available for new keys. If a
+table is full, it can be resized using [`Vectors.resize`](/api/vectors#resize).
+In `floret` mode, the table is always full and cannot be resized.
> #### Example
>
@@ -441,7 +440,7 @@ Load state from a binary string.
> #### Example
>
> ```python
-> fron spacy.vectors import Vectors
+> from spacy.vectors import Vectors
> vectors_bytes = vectors.to_bytes()
> new_vectors = Vectors(StringStore())
> new_vectors.from_bytes(vectors_bytes)
diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx
index 5f1e5b817..2bd2856b6 100644
--- a/website/docs/usage/embeddings-transformers.mdx
+++ b/website/docs/usage/embeddings-transformers.mdx
@@ -632,6 +632,165 @@ def MyCustomVectors(
)
```
+#### Creating a custom vectors implementation {id="custom-vectors",version="3.7"}
+
+You can specify a custom registered vectors class under `[nlp.vectors]` in order
+to use static vectors in formats other than the ones supported by
+[`Vectors`](/api/vectors). Extend the abstract [`BaseVectors`](/api/basevectors)
+class to implement your custom vectors.
+
+As an example, the following `BPEmbVectors` class implements support for
+[BPEmb subword embeddings](https://bpemb.h-its.org/):
+
+```python
+# requires: pip install bpemb
+import warnings
+from pathlib import Path
+from typing import Callable, Optional, cast
+
+from bpemb import BPEmb
+from thinc.api import Ops, get_current_ops
+from thinc.backends import get_array_ops
+from thinc.types import Floats2d
+
+from spacy.strings import StringStore
+from spacy.util import registry
+from spacy.vectors import BaseVectors
+from spacy.vocab import Vocab
+
+
+class BPEmbVectors(BaseVectors):
+ def __init__(
+ self,
+ *,
+ strings: Optional[StringStore] = None,
+ lang: Optional[str] = None,
+ vs: Optional[int] = None,
+ dim: Optional[int] = None,
+ cache_dir: Optional[Path] = None,
+ encode_extra_options: Optional[str] = None,
+ model_file: Optional[Path] = None,
+ emb_file: Optional[Path] = None,
+ ):
+ kwargs = {}
+ if lang is not None:
+ kwargs["lang"] = lang
+ if vs is not None:
+ kwargs["vs"] = vs
+ if dim is not None:
+ kwargs["dim"] = dim
+ if cache_dir is not None:
+ kwargs["cache_dir"] = cache_dir
+ if encode_extra_options is not None:
+ kwargs["encode_extra_options"] = encode_extra_options
+ if model_file is not None:
+ kwargs["model_file"] = model_file
+ if emb_file is not None:
+ kwargs["emb_file"] = emb_file
+ self.bpemb = BPEmb(**kwargs)
+ self.strings = strings
+ self.name = repr(self.bpemb)
+ self.n_keys = -1
+ self.mode = "BPEmb"
+ self.to_ops(get_current_ops())
+
+ def __contains__(self, key):
+ return True
+
+ def is_full(self):
+ return True
+
+ def add(self, key, *, vector=None, row=None):
+ warnings.warn(
+ (
+ "Skipping BPEmbVectors.add: the bpemb vector table cannot be "
+ "modified. Vectors are calculated from bytepieces."
+ )
+ )
+ return -1
+
+ def __getitem__(self, key):
+ return self.get_batch([key])[0]
+
+ def get_batch(self, keys):
+ keys = [self.strings.as_string(key) for key in keys]
+ bp_ids = self.bpemb.encode_ids(keys)
+ ops = get_array_ops(self.bpemb.emb.vectors)
+ indices = ops.asarray(ops.xp.hstack(bp_ids), dtype="int32")
+ lengths = ops.asarray([len(x) for x in bp_ids], dtype="int32")
+ vecs = ops.reduce_mean(cast(Floats2d, self.bpemb.emb.vectors[indices]), lengths)
+ return vecs
+
+ @property
+ def shape(self):
+ return self.bpemb.vectors.shape
+
+ def __len__(self):
+ return self.shape[0]
+
+ @property
+ def vectors_length(self):
+ return self.shape[1]
+
+ @property
+ def size(self):
+ return self.bpemb.vectors.size
+
+ def to_ops(self, ops: Ops):
+ self.bpemb.emb.vectors = ops.asarray(self.bpemb.emb.vectors)
+
+
+@registry.vectors("BPEmbVectors.v1")
+def create_bpemb_vectors(
+ lang: Optional[str] = "multi",
+ vs: Optional[int] = None,
+ dim: Optional[int] = None,
+ cache_dir: Optional[Path] = None,
+ encode_extra_options: Optional[str] = None,
+ model_file: Optional[Path] = None,
+ emb_file: Optional[Path] = None,
+) -> Callable[[Vocab], BPEmbVectors]:
+ def bpemb_vectors_factory(vocab: Vocab) -> BPEmbVectors:
+ return BPEmbVectors(
+ strings=vocab.strings,
+ lang=lang,
+ vs=vs,
+ dim=dim,
+ cache_dir=cache_dir,
+ encode_extra_options=encode_extra_options,
+ model_file=model_file,
+ emb_file=emb_file,
+ )
+
+ return bpemb_vectors_factory
+```
+
+
+
+Note that the serialization methods are not implemented, so the embeddings are
+loaded from your local cache or downloaded by `BPEmb` each time the pipeline is
+loaded.
+
+
+
+To use this in your pipeline, specify this registered function under
+`[nlp.vectors]` in your config:
+
+```ini
+[nlp.vectors]
+@vectors = "BPEmbVectors.v1"
+lang = "en"
+```
+
+Or specify it when creating a blank pipeline:
+
+```python
+nlp = spacy.blank("en", config={"nlp.vectors": {"@vectors": "BPEmbVectors.v1", "lang": "en"}})
+```
+
+Remember to include this code with `--code` when using
+[`spacy train`](/api/cli#train) and [`spacy package`](/api/cli#package).
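+
+For example, if the `BPEmbVectors` implementation above is saved in a file
+named `bpemb_vectors.py` (a hypothetical filename), the training command could
+look like this:
+
+```cli
+$ python -m spacy train config.cfg --code bpemb_vectors.py
+```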
+
## Pretraining {id="pretraining"}
The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your
diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx
index 414968d42..c50e9db6c 100644
--- a/website/docs/usage/index.mdx
+++ b/website/docs/usage/index.mdx
@@ -20,7 +20,7 @@ menu:
## Installation instructions {id="installation"}
-spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**,
+spaCy is compatible with **64-bit CPython 3.7+** and runs on **Unix/Linux**,
**macOS/OS X** and **Windows**. The latest spaCy releases are available over
[pip](https://pypi.python.org/pypi/spacy) and
[conda](https://anaconda.org/conda-forge/spacy).
diff --git a/website/docs/usage/v3-7.mdx b/website/docs/usage/v3-7.mdx
new file mode 100644
index 000000000..76fc9530f
--- /dev/null
+++ b/website/docs/usage/v3-7.mdx
@@ -0,0 +1,140 @@
+---
+title: What's New in v3.7
+teaser: New features and how to upgrade
+menu:
+ - ['New Features', 'features']
+ - ['Upgrading Notes', 'upgrading']
+---
+
+## New features {id="features",hidden="true"}
+
+spaCy v3.7 adds support for Python 3.12, introduces the new standalone library
+[Weasel](https://github.com/explosion/weasel) for project workflows, and updates
+the transformer-based trained pipelines to use our new
+[Curated Transformers](https://github.com/explosion/curated-transformers)
+library.
+
+This release drops support for Python 3.6.
+
+### Weasel {id="weasel"}
+
+The [spaCy projects](/usage/projects) functionality has been moved into a new
+standalone library [Weasel](https://github.com/explosion/weasel). This brings
+minor changes to spaCy-specific settings in spaCy projects (see
+[upgrading](#upgrading) below), but also makes it possible to use the same
+workflow functionality outside of spaCy.
+
+All `spacy project` commands should run as before, except that they now use
+Weasel under the hood.
+
+
+
+Remote storage for spaCy projects is not yet supported for Python 3.12. Use
+Python 3.11 or earlier for remote storage.
+
+
+
+### Registered vectors {id="custom-vectors"}
+
+You can specify a custom registered vectors class under `[nlp.vectors]` in order
+to use static vectors in formats other than the ones supported by
+[`Vectors`](/api/vectors). To implement your custom vectors, extend the abstract
+class [`BaseVectors`](/api/basevectors). See an example using
+[BPEmb subword embeddings](/usage/embeddings-transformers#custom-vectors).
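+
+A config sketch, reusing the `BPEmbVectors.v1` function registered in that
+example:
+
+```ini
+[nlp.vectors]
+@vectors = "BPEmbVectors.v1"
+lang = "en"
+```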
+
+### Additional features and improvements {id="additional-features-and-improvements"}
+
+- Add support for Python 3.12.
+- Extend to Thinc v8.2.
+- Extend `transformers` extra to `spacy-transformers` v1.3.
+- Add `--spans-key` option for CLI evaluation with `spacy benchmark accuracy`.
+- Load the CLI module lazily for `spacy.info`.
+- Add type stubs for `spacy.training.example`.
+- Warn for unsupported pattern keys in dependency matcher.
+- `Language.replace_listeners`: Pass the replaced listener and the `tok2vec`
+ pipe to the callback in order to support `spacy-curated-transformers`.
+- Always use `tqdm` with `disable=None` in order to disable output in
+ non-interactive environments.
+- Language updates:
+  - Add left- and right-pointing angle brackets as punctuation to Ancient Greek.
+ - Update example sentences for Turkish.
+- Package setup updates:
+ - Update NumPy build constraints for NumPy 1.25+. For Python 3.9+, it is no
+ longer necessary to set build constraints while building binary wheels.
+ - Refactor Cython profiling in order to disable profiling for Python 3.12 in
+ the package setup, since Cython does not currently support profiling for
+ Python 3.12.
+
+## Trained pipelines {id="pipelines"}
+
+### Pipeline updates {id="pipeline-updates"}
+
+The transformer-based `trf` pipelines have been updated to use our new
+[Curated Transformers](https://github.com/explosion/curated-transformers)
+library using the Thinc model wrappers and pipeline component from
+[spaCy Curated Transformers](https://github.com/explosion/spacy-curated-transformers).
+
+## Notes about upgrading from v3.6 {id="upgrading"}
+
+This release drops support for Python 3.6, drops mypy checks for Python 3.7 and
+removes the `ray` extra. In addition, there are several minor changes for spaCy
+projects, described in the following section.
+
+### Backwards incompatibilities for spaCy Projects {id="upgrading-projects"}
+
+`spacy project` has a few backwards incompatibilities due to the transition to
+the standalone library [Weasel](https://github.com/explosion/weasel), which is
+not as tightly coupled to spaCy. Weasel produces warnings when it detects older
+spaCy-specific settings in your environment or project config.
+
+- Support for the `spacy_version` configuration key has been dropped.
+- Support for the `check_requirements` configuration key has been dropped due to
+ the deprecation of `pkg_resources`.
+- The `SPACY_CONFIG_OVERRIDES` environment variable is no longer checked. You
+ can set configuration overrides using `WEASEL_CONFIG_OVERRIDES`.
+- Support for the `SPACY_PROJECT_USE_GIT_VERSION` environment variable has been
+  dropped.
+- Error codes are now Weasel-specific and do not follow spaCy error codes.
+
+### Pipeline package version compatibility {id="version-compat"}
+
+> #### Using legacy implementations
+>
+> In spaCy v3, you'll still be able to load and reference legacy implementations
+> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
+> components or architectures change and newer versions are available in the
+> core library.
+
+When you're loading a pipeline package trained with an earlier version of spaCy
+v3, you will see a warning telling you that the pipeline may be incompatible.
+The pipeline isn't necessarily incompatible, but we recommend running your
+pipelines against your test suite or evaluation data to make sure there are no
+unexpected results.
+
+If you're using one of the [trained pipelines](/models) we provide, you should
+run [`spacy download`](/api/cli#download) to update to the latest version. To
+see an overview of all installed packages and their compatibility, you can run
+[`spacy validate`](/api/cli#validate).
+
+If you've trained your own custom pipeline and you've confirmed that it's still
+working as expected, you can update the spaCy version requirements in the
+[`meta.json`](/api/data-formats#meta):
+
+```diff
+- "spacy_version": ">=3.6.0,<3.7.0",
++ "spacy_version": ">=3.6.0,<3.8.0",
+```
+
+### Updating v3.6 configs
+
+To update a config from spaCy v3.6 with the new v3.7 settings, run
+[`init fill-config`](/api/cli#init-fill-config):
+
+```cli
+$ python -m spacy init fill-config config-v3.6.cfg config-v3.7.cfg
+```
+
+In many cases ([`spacy train`](/api/cli#train),
+[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
+automatically, but you'll need to fill in the new settings to run
+[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
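+
+For example, you can then run the data check directly on the filled config from
+the command above:
+
+```cli
+$ python -m spacy debug data config-v3.7.cfg
+```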
diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx
index 1ac931753..e73c4a16a 100644
--- a/website/docs/usage/visualizers.mdx
+++ b/website/docs/usage/visualizers.mdx
@@ -349,7 +349,8 @@ or
[SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet).
If you set `manual=True` on either `render()` or `serve()`, you can pass in data
in displaCy's format as a dictionary (instead of `Doc` objects). There are
-helper functions for converting `Doc` objects to displaCy's format for use with
+helper functions for converting `Doc` objects to
+[displaCy's format](/api/top-level#displacy_structures) for use with
`manual=True`: [`displacy.parse_deps`](/api/top-level#displacy.parse_deps),
[`displacy.parse_ents`](/api/top-level#displacy.parse_ents), and
[`displacy.parse_spans`](/api/top-level#displacy.parse_spans).
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 033f71b12..24213ed12 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -15,7 +15,8 @@
{ "text": "New in v3.3", "url": "/usage/v3-3" },
{ "text": "New in v3.4", "url": "/usage/v3-4" },
{ "text": "New in v3.5", "url": "/usage/v3-5" },
- { "text": "New in v3.6", "url": "/usage/v3-6" }
+ { "text": "New in v3.6", "url": "/usage/v3-6" },
+ { "text": "New in v3.7", "url": "/usage/v3-7" }
]
},
{
@@ -100,6 +101,7 @@
"items": [
{ "text": "AttributeRuler", "url": "/api/attributeruler" },
{ "text": "CoreferenceResolver", "url": "/api/coref" },
+ { "text": "CuratedTransformer", "url": "/api/curatedtransformer" },
{ "text": "DependencyParser", "url": "/api/dependencyparser" },
{ "text": "EditTreeLemmatizer", "url": "/api/edittreelemmatizer" },
{ "text": "EntityLinker", "url": "/api/entitylinker" },
@@ -135,6 +137,7 @@
"label": "Other",
"items": [
{ "text": "Attributes", "url": "/api/attributes" },
+ { "text": "BaseVectors", "url": "/api/basevectors" },
{ "text": "Corpus", "url": "/api/corpus" },
{ "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
{ "text": "KnowledgeBase", "url": "/api/kb" },
diff --git a/website/src/templates/index.js b/website/src/templates/index.js
index c8295593c..1c969bd39 100644
--- a/website/src/templates/index.js
+++ b/website/src/templates/index.js
@@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
}
const navAlert = (
-
- 💥 Out now: spaCy v3.6
+
+ 💥 Out now: spaCy v3.7
)
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index b6c8b9b4c..43e3a0eeb 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -10,15 +10,19 @@ const DEFAULT_PLATFORM = 'x86'
const DEFAULT_MODELS = ['en']
const DEFAULT_OPT = 'efficiency'
const DEFAULT_HARDWARE = 'cpu'
-const DEFAULT_CUDA = 'cuda-autodetect'
+const DEFAULT_CUDA = 'cuda11x'
const CUDA = {
'8.0': 'cuda80',
'9.0': 'cuda90',
- 9.1: 'cuda91',
- 9.2: 'cuda92',
+ '9.1': 'cuda91',
+ '9.2': 'cuda92',
'10.0': 'cuda100',
- 10.1: 'cuda101',
- '10.2, 11.0+': 'cuda-autodetect',
+ '10.1': 'cuda101',
+ '10.2': 'cuda102',
+ '11.0': 'cuda110',
+ '11.1': 'cuda111',
+ '11.2-11.x': 'cuda11x',
+ '12.x': 'cuda12x',
}
const LANG_EXTRAS = ['ja'] // only for languages with models