From 2dbb332cea8cb950333d8e8eb222d8d3f6f476b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 2 Feb 2024 13:01:59 +0100 Subject: [PATCH 01/20] `TextCatParametricAttention.v1`: set key transform dimensions (#13249) * TextCatParametricAttention.v1: set key transform dimensions This is necessary for tok2vec implementations that initialize lazily (e.g. curated transformers). * Add lazily-initialized tok2vec to simulate transformers Add a lazily-initialized tok2vec to the tests and test the current textcat models with it. Fix some additional issues found using this test. * isort * Add `test.` prefix to `LazyInitTok2Vec.v1` --- spacy/ml/models/textcat.py | 15 ++++++++++- spacy/tests/pipeline/test_textcat.py | 37 ++++++++++++++++++++++++++++ spacy/tests/tok2vec.py | 36 +++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/tok2vec.py diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 3e5471ab3..601c94a7f 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -185,6 +185,11 @@ def build_text_classifier_v2( def init_ensemble_textcat(model, X, Y) -> Model: + # When tok2vec is lazily initialized, we need to initialize it before + # the rest of the chain to ensure that we can get its width. + tok2vec = model.get_ref("tok2vec") + tok2vec.initialize(X) + tok2vec_width = get_tok2vec_width(model) model.get_ref("attention_layer").set_dim("nO", tok2vec_width) model.get_ref("maxout_layer").set_dim("nO", tok2vec_width) @@ -264,6 +269,7 @@ def _build_parametric_attention_with_residual_nonlinear( parametric_attention.set_ref("tok2vec", tok2vec) parametric_attention.set_ref("attention_layer", attention_layer) + parametric_attention.set_ref("key_transform", key_transform) parametric_attention.set_ref("nonlinear_layer", nonlinear_layer) parametric_attention.set_ref("norm_layer", norm_layer) @@ -271,10 +277,17 @@ def _build_parametric_attention_with_residual_nonlinear( def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model: + # When tok2vec is lazily initialized, we need to initialize it before + # the rest of the chain to ensure that we can get its width. + tok2vec = model.get_ref("tok2vec") + tok2vec.initialize(X) + tok2vec_width = get_tok2vec_width(model) model.get_ref("attention_layer").set_dim("nO", tok2vec_width) - model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width) + model.get_ref("key_transform").set_dim("nI", tok2vec_width) + model.get_ref("key_transform").set_dim("nO", tok2vec_width) model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width) + model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width) model.get_ref("norm_layer").set_dim("nI", tok2vec_width) model.get_ref("norm_layer").set_dim("nO", tok2vec_width) init_chain(model, X, Y) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 7a78c3dac..8a0c1a976 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -28,6 +28,8 @@ from spacy.tokens import Doc, DocBin from spacy.training import Example from spacy.training.initialize import init_nlp +# Ensure that the architecture gets added to the registry. 
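+# (Importing the module runs the `@registry.architectures` decorator for
+# `build_lazy_init_tok2vec` as a side effect, which is why the import below is
+# bound to the throwaway name `_`.)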
+from ..tok2vec import build_lazy_init_tok2vec as _ from ..util import make_tempdir TRAIN_DATA_SINGLE_LABEL = [ @@ -40,6 +42,13 @@ TRAIN_DATA_MULTI_LABEL = [ ("I'm confused but happy", {"cats": {"ANGRY": 0.0, "CONFUSED": 1.0, "HAPPY": 1.0}}), ] +lazy_init_model_config = """ +[model] +@architectures = "test.LazyInitTok2Vec.v1" +width = 96 +""" +LAZY_INIT_TOK2VEC_MODEL = Config().from_str(lazy_init_model_config)["model"] + def make_get_examples_single_label(nlp): train_examples = [] @@ -546,6 +555,34 @@ def test_error_with_multi_labels(): nlp.initialize(get_examples=lambda: train_examples) +# fmt: off +@pytest.mark.parametrize( + "name,textcat_config", + [ + # ENSEMBLE V2 + ("textcat_multilabel", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}), + ("textcat", {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}), + # PARAMETRIC ATTENTION V1 + ("textcat", {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": True}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": False}), + # REDUCE + ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), + ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": LAZY_INIT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}), + ], +) +# fmt: on +def test_tok2vec_lazy_init(name, textcat_config): + # Check that we can properly initialize and use a textcat model using + # a lazily-initialized tok2vec. + nlp = English() + pipe_config = {"model": textcat_config} + textcat = nlp.add_pipe(name, config=pipe_config) + textcat.add_label("POSITIVE") + textcat.add_label("NEGATIVE") + nlp.initialize() + nlp.pipe(["This is a test."]) + + @pytest.mark.parametrize( "name,get_examples, train_data", [ diff --git a/spacy/tests/tok2vec.py b/spacy/tests/tok2vec.py new file mode 100644 index 000000000..7e7b689eb --- /dev/null +++ b/spacy/tests/tok2vec.py @@ -0,0 +1,36 @@ +from typing import List + +from thinc.api import Model +from thinc.types import Floats2d + +from spacy.tokens import Doc +from spacy.util import registry + + +@registry.architectures("test.LazyInitTok2Vec.v1") +def build_lazy_init_tok2vec(*, width: int) -> Model[List[Doc], List[Floats2d]]: + """tok2vec model of which the output size is only known after + initialization. 
This implementation does not output meaningful
+    embeddings, it is strictly for testing."""
+    return Model(
+        "lazy_init_tok2vec",
+        lazy_init_tok2vec_forward,
+        init=lazy_init_tok2vec_init,
+        dims={"nO": None},
+        attrs={"width": width},
+    )
+
+
+def lazy_init_tok2vec_init(model: Model, X=None, Y=None):
+    width = model.attrs["width"]
+    model.set_dim("nO", width)
+
+
+def lazy_init_tok2vec_forward(model: Model, X: List[Doc], is_train: bool):
+    width = model.get_dim("nO")
+    Y = [model.ops.alloc2f(len(doc), width) for doc in X]
+
+    def backprop(dY):
+        return []
+
+    return Y, backprop

From 40422ff9049541ae24e28aa16e8b536fc9a71381 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Fri, 2 Feb 2024 13:51:26 +0100
Subject: [PATCH 02/20] Set version to 3.7.3 (#13301)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 9da0b6d74..239527aff 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.2"
+__version__ = "3.7.3"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

From e1249d3722765aaca56f538e830add7014d20e2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Mon, 5 Feb 2024 10:07:03 +0100
Subject: [PATCH 03/20] Test if closing explicitly solves recursive lock issues
 (#13304)

---
 spacy/language.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 0287549db..568d2d4fa 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1716,6 +1716,7 @@ class Language:
         # is done, so that they can exit gracefully.
         for q in texts_q:
             q.put(_WORK_DONE_SENTINEL)
+            q.close()
 
         # Otherwise, we are stopping because the error handler raised an
         # exception. The sentinel will be last to go out of the queue.
@@ -2347,7 +2348,8 @@ def _apply_pipes(
 
         # Stop working if we encounter the end-of-work sentinel.
         if isinstance(texts_with_ctx, _WorkDoneSentinel):
-            return
+            sender.close()
+            receiver.close()
 
         docs = (
             ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
@@ -2371,7 +2373,8 @@
             # Parent has closed the pipe prematurely. This happens when a
             # worker encounters an error and the error handler is set to
             # stop processing.
-            return
+            sender.close()
+            receiver.close()

From 14bd9d89a3fea6a36bd0fe651ef43035f0a90d88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Sun, 11 Feb 2024 19:46:43 +0100
Subject: [PATCH 04/20] Update example that shows model in requirements
 (#13302)

See #13293.
---
 website/docs/usage/models.mdx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx
index 3b8a5fa3f..7fed9f407 100644
--- a/website/docs/usage/models.mdx
+++ b/website/docs/usage/models.mdx
@@ -526,13 +526,17 @@ application's `requirements.txt`. If you're running your own internal PyPi
 installation, you can upload the pipeline packages there. pip's
 [requirements file format](https://pip.pypa.io/en/latest/reference/requirements-file-format/)
 supports both package names to download via a PyPi server, as well as
-[direct URLs](#pipeline-urls).
+[direct URLs](#pipeline-urls). For instance, you can specify the
+`en_core_web_sm` model for spaCy 3.7.x as follows:
 
 ```text {title="requirements.txt"}
 spacy>=3.0.0,<4.0.0
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
 ```
 
+See the [list of models](https://spacy.io/models) for model download links for
+the current spaCy version.
+
 All pipeline packages are versioned and specify their spaCy dependency. This
 ensures cross-compatibility and lets you specify exact version requirements
 for each pipeline. If you've [trained](/usage/training) your own pipeline, you can

From fdfdbcd9f40c73eefe106f9ebf26767809d69a83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Mon, 12 Feb 2024 14:39:38 +0100
Subject: [PATCH 05/20] Make `Language.pipe` workers exit cleanly (#13321)

Also warn when any worker exited with a non-zero exit code and modify
test to ensure that workers exit cleanly by default.
---
 spacy/errors.py              |  1 +
 spacy/language.py            |  5 +++++
 spacy/tests/test_language.py | 11 ++++++++---
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index b6108dd0f..cf9a7b708 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -220,6 +220,7 @@ class Warnings(metaclass=ErrorsWithCodes):
             "key attribute for vectors, configure it through Vectors(attr=) or "
             "'spacy init vectors --attr'")
     W126 = ("These keys are unsupported: {unsupported}")
+    W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
 
 class Errors(metaclass=ErrorsWithCodes):
diff --git a/spacy/language.py b/spacy/language.py
index 568d2d4fa..18d20c939 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1730,6 +1730,9 @@ class Language:
         for proc in procs:
             proc.join()
 
+        if not all(proc.exitcode == 0 for proc in procs):
+            warnings.warn(Warnings.W127)
+
     def _link_components(self) -> None:
         """Register 'listeners' within pipeline components, to allow them to
         effectively share weights.
@@ -2350,6 +2353,7 @@ def _apply_pipes(
         if isinstance(texts_with_ctx, _WorkDoneSentinel):
             sender.close()
             receiver.close()
+            return
 
         docs = (
             ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
@@ -2375,6 +2379,7 @@
             # Parent has closed the pipe prematurely. This happens when a
             # worker encounters an error and the error handler is set to
             # stop processing.
             sender.close()
             receiver.close()
+            return
 
 
 class _Sender:
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 51eec3239..d229739e1 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -1,5 +1,6 @@
 import itertools
 import logging
+import warnings
 from unittest import mock
 
 import pytest
@@ -738,9 +739,13 @@ def test_pass_doc_to_pipeline(nlp, n_process):
     assert doc.text == texts[0]
     assert len(doc.cats) > 0
     if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
-        docs = nlp.pipe(docs, n_process=n_process)
-        assert [doc.text for doc in docs] == texts
-        assert all(len(doc.cats) for doc in docs)
+        # Catch warnings to ensure that all worker processes exited
+        # successfully.
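+        # (warnings.simplefilter("error") below promotes any warning, such as
+        # the new W127, to an exception, so an unclean worker exit fails the
+        # test.)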
+ with warnings.catch_warnings(): + warnings.simplefilter("error") + docs = nlp.pipe(docs, n_process=n_process) + assert [doc.text for doc in docs] == texts + assert all(len(doc.cats) for doc in docs) def test_invalid_arg_to_pipeline(nlp): From bff8725f4b4b93033bdeba6ad306e7ea79f7a402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 14 Feb 2024 14:46:28 +0100 Subject: [PATCH 06/20] Set version to 3.7.4 (#13327) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 239527aff..f5ee66dae 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.7.3" +__version__ = "3.7.4" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 0518c36f04864a588905394b2aeefd078a87784a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 20 Feb 2024 13:17:51 +0100 Subject: [PATCH 07/20] Sanitize direct download (#13313) The 'direct' option in 'spacy download' is supposed to only download from our model releases repository. However, users were able to pass in a relative path, allowing download from arbitrary repositories. This meant that a service that sourced strings from user input and which used the direct option would allow users to install arbitrary packages. --- spacy/cli/__init__.py | 2 ++ spacy/cli/download.py | 19 ++++++++++++++++++- spacy/tests/test_cli.py | 14 +++++++++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 1d402ff0c..3095778fe 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,5 +1,7 @@ from wasabi import msg +# Needed for testing +from . import download as download_module # noqa: F401 from ._util import app, setup_cli # noqa: F401 from .apply import apply # noqa: F401 from .assemble import assemble_cli # noqa: F401 diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 21c777f81..4261fb830 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,5 +1,6 @@ import sys from typing import Optional, Sequence +from urllib.parse import urljoin import requests import typer @@ -63,6 +64,13 @@ def download( ) pip_args = pip_args + ("--no-deps",) if direct: + # Reject model names with '/', in order to prevent shenanigans. + if "/" in model: + msg.fail( + title="Model download rejected", + text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments", + exits=True, + ) components = model.split("-") model_name = "".join(components[:-1]) version = components[-1] @@ -153,7 +161,16 @@ def get_latest_version(model: str) -> str: def download_model( filename: str, user_pip_args: Optional[Sequence[str]] = None ) -> None: - download_url = about.__download_url__ + "/" + filename + # Construct the download URL carefully. We need to make sure we don't + # allow relative paths or other shenanigans to trick us into download + # from outside our own repo. + base_url = about.__download_url__ + # urljoin requires that the path ends with /, or the last path part will be dropped + if not base_url.endswith("/"): + base_url = about.__download_url__ + "/" + download_url = urljoin(base_url, filename) + if not download_url.startswith(about.__download_url__): + raise ValueError(f"Download from {filename} rejected. 
Was it a relative path?") pip_args = list(user_pip_args) if user_pip_args is not None else [] cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] run_command(cmd) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index ff53ed1e1..7b729d78f 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,7 +12,7 @@ from thinc.api import Config import spacy from spacy import about -from spacy.cli import info +from spacy.cli import download_module, info from spacy.cli._util import parse_config_overrides, string_to_list, walk_directory from spacy.cli.apply import apply from spacy.cli.debug_data import ( @@ -1066,3 +1066,15 @@ def test_debug_data_trainable_lemmatizer_not_annotated(): def test_project_api_imports(): from spacy.cli import project_run from spacy.cli.project.run import project_run # noqa: F401, F811 + + +def test_download_rejects_relative_urls(monkeypatch): + """Test that we can't tell spacy download to get an arbitrary model by using a + relative path in the filename""" + + monkeypatch.setattr(download_module, "run_command", lambda cmd: None) + + # Check that normal download works + download_module.download("en_core_web_sm-3.7.1", direct=True) + with pytest.raises(SystemExit): + download_module.download("../en_core_web_sm-3.7.1", direct=True) From d410d95b520e1a958f75062ab18b44b8ec8ea266 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 22 Mar 2024 18:21:20 +0100 Subject: [PATCH 08/20] remove smart_open requirement as it's taken care of via Weasel (#13391) --- requirements.txt | 1 - setup.cfg | 1 - 2 files changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 036867ddc..0ad05c629 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.10.0 -smart-open>=5.2.1,<7.0.0 weasel>=0.1.0,<0.4.0 # Third party dependencies numpy>=1.15.0; python_version < "3.9" diff --git a/setup.cfg b/setup.cfg index 5e8e99f87..f9274cfae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,7 +56,6 @@ install_requires = weasel>=0.1.0,<0.4.0 # Third-party dependencies typer>=0.3.0,<0.10.0 - smart-open>=5.2.1,<7.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" From 1252370f6984f977de000bd0da74508c144e20d5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 25 Mar 2024 10:17:57 +0100 Subject: [PATCH 09/20] Move DocSearch key to env var [ci skip] --- website/meta/site.json | 1 - website/next.config.mjs | 3 +++ website/src/components/search.js | 5 +++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/website/meta/site.json b/website/meta/site.json index f1d318071..55fe60ad3 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -23,7 +23,6 @@ }, "docSearch": { "appId": "Y1LB128RON", - "apiKey": "bb601a1daab73e2dc66faf2b79564807", "indexName": "spacy" }, "binderUrl": "explosion/spacy-io-binder", diff --git a/website/next.config.mjs b/website/next.config.mjs index df3b1d01d..5e2f8f8c3 100644 --- a/website/next.config.mjs +++ b/website/next.config.mjs @@ -32,6 +32,9 @@ const nextConfig = withPWA( ignoreBuildErrors: true, }, images: { unoptimized: true }, + env: { + DOCSEARCH_API_KEY: process.env.DOCSEARCH_API_KEY + } }) ) diff --git a/website/src/components/search.js b/website/src/components/search.js index f80d9cd9f..3211b53c0 100644 --- a/website/src/components/search.js +++ b/website/src/components/search.js @@ -1,4 +1,4 @@ -import React, { useEffect, useState } 
from 'react' +import React from 'react' import PropTypes from 'prop-types' import { DocSearch } from '@docsearch/react' import '@docsearch/css' @@ -6,7 +6,8 @@ import '@docsearch/css' import siteMetadata from '../../meta/site.json' export default function Search({ placeholder = 'Search docs' }) { - const { apiKey, indexName, appId } = siteMetadata.docSearch + const apiKey = process.env.DOCSEARCH_API_KEY + const { indexName, appId } = siteMetadata.docSearch return ( ) From 4dc5fe54694ec5c9ddac8dc3710fe3d2ae657b24 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 26 Mar 2024 09:53:07 +0100 Subject: [PATCH 10/20] Renamed main branch back to v4 for now (#13395) * Update gputests.yml * Update slowtests.yml --- .github/workflows/gputests.yml | 2 +- .github/workflows/slowtests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml index c6ea98f76..66e0707e0 100644 --- a/.github/workflows/gputests.yml +++ b/.github/workflows/gputests.yml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [master, main] + branch: [master, v4] if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index 4a4f08005..f9fd3e817 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [master, main] + branch: [master, v4] if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: From 21aea59001f4cf100a0c7df0c36aeddd796cee1f Mon Sep 17 00:00:00 2001 From: Yaseen <9275716+ynx0@users.noreply.github.com> Date: Tue, 26 Mar 2024 06:15:25 -0500 Subject: [PATCH 11/20] Update code.module.sass to make code title sticky (#13379) --- website/src/styles/code.module.sass | 2 ++ 1 file changed, 2 insertions(+) diff --git a/website/src/styles/code.module.sass b/website/src/styles/code.module.sass index b619c71cc..459281b43 100644 --- a/website/src/styles/code.module.sass +++ b/website/src/styles/code.module.sass @@ -109,6 +109,8 @@ box-shadow: inset 1px 1px 1px rgba(0, 0, 0, 0.25) background: var(--color-dark) margin: 1.5rem 0 0 2rem + position: sticky + left: 2rem .header width: 100% From f5e85fa05a5de357ee6a516a907042ec28f4f580 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 4 Apr 2024 12:55:08 +0200 Subject: [PATCH 12/20] allow weasel 0.4.x (#13409) --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0ad05c629..54b8f22a1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 typer>=0.3.0,<0.10.0 -weasel>=0.1.0,<0.4.0 +weasel>=0.1.0,<0.5.0 # Third party dependencies numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" diff --git a/setup.cfg b/setup.cfg index f9274cfae..a6b14eb06 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,7 +53,7 @@ install_requires = wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 - weasel>=0.1.0,<0.4.0 + weasel>=0.1.0,<0.5.0 # Third-party dependencies typer>=0.3.0,<0.10.0 tqdm>=4.38.0,<5.0.0 From 2e9679769621449db4fa656483d956628cd52f96 Mon Sep 17 00:00:00 2001 From: Joe Schiff <41972063+JoeSchiff@users.noreply.github.com> Date: Tue, 16 Apr 2024 05:51:14 -0400 Subject: [PATCH 13/20] Convert properties to decorator syntax (#13390) --- spacy/lexeme.pyx | 425 
++++++++++++++++++++----------------- spacy/tokenizer.pyx | 99 +++++---- spacy/tokens/doc.pyx | 169 +++++++-------- spacy/tokens/span.pyx | 148 +++++++------ spacy/tokens/token.pyx | 332 +++++++++++++++-------------- spacy/training/example.pyx | 36 ++-- spacy/vocab.pyx | 40 ++-- 7 files changed, 665 insertions(+), 584 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index f803d5e93..7a0c19bf3 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -164,45 +164,48 @@ cdef class Lexeme: vector = self.vector return numpy.sqrt((vector**2).sum()) - property vector: + @property + def vector(self): """A real-valued meaning representation. RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array representing the lexeme's semantics. """ - def __get__(self): - cdef int length = self.vocab.vectors_length - if length == 0: - raise ValueError(Errors.E010) - return self.vocab.get_vector(self.c.orth) + cdef int length = self.vocab.vectors_length + if length == 0: + raise ValueError(Errors.E010) + return self.vocab.get_vector(self.c.orth) - def __set__(self, vector): - if len(vector) != self.vocab.vectors_length: - raise ValueError(Errors.E073.format(new_length=len(vector), - length=self.vocab.vectors_length)) - self.vocab.set_vector(self.c.orth, vector) + @vector.setter + def vector(self, vector): + if len(vector) != self.vocab.vectors_length: + raise ValueError(Errors.E073.format(new_length=len(vector), + length=self.vocab.vectors_length)) + self.vocab.set_vector(self.c.orth, vector) - property rank: + @property + def rank(self): """RETURNS (str): Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors.""" - def __get__(self): - return self.c.id + return self.c.id - def __set__(self, value): - self.c.id = value + @rank.setter + def rank(self, value): + self.c.id = value - property sentiment: + @property + def sentiment(self): """RETURNS (float): A scalar value indicating the positivity or negativity of the lexeme.""" - def __get__(self): - sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {}) - return sentiment_table.get(self.c.orth, 0.0) + sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {}) + return sentiment_table.get(self.c.orth, 0.0) - def __set__(self, float x): - if "lexeme_sentiment" not in self.vocab.lookups: - self.vocab.lookups.add_table("lexeme_sentiment") - sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment") - sentiment_table[self.c.orth] = x + @sentiment.setter + def sentiment(self, float x): + if "lexeme_sentiment" not in self.vocab.lookups: + self.vocab.lookups.add_table("lexeme_sentiment") + sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment") + sentiment_table[self.c.orth] = x @property def orth_(self): @@ -216,306 +219,338 @@ cdef class Lexeme: """RETURNS (str): The original verbatim text of the lexeme.""" return self.orth_ - property lower: + @property + def lower(self): """RETURNS (uint64): Lowercase form of the lexeme.""" - def __get__(self): - return self.c.lower + return self.c.lower - def __set__(self, attr_t x): - self.c.lower = x + @lower.setter + def lower(self, attr_t x): + self.c.lower = x - property norm: + @property + def norm(self): """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the lexeme text. 
""" - def __get__(self): - return self.c.norm + return self.c.norm - def __set__(self, attr_t x): - if "lexeme_norm" not in self.vocab.lookups: - self.vocab.lookups.add_table("lexeme_norm") - norm_table = self.vocab.lookups.get_table("lexeme_norm") - norm_table[self.c.orth] = self.vocab.strings[x] - self.c.norm = x + @norm.setter + def norm(self, attr_t x): + if "lexeme_norm" not in self.vocab.lookups: + self.vocab.lookups.add_table("lexeme_norm") + norm_table = self.vocab.lookups.get_table("lexeme_norm") + norm_table[self.c.orth] = self.vocab.strings[x] + self.c.norm = x - property shape: + @property + def shape(self): """RETURNS (uint64): Transform of the word's string, to show orthographic features. """ - def __get__(self): - return self.c.shape + return self.c.shape - def __set__(self, attr_t x): - self.c.shape = x + @shape.setter + def shape(self, attr_t x): + self.c.shape = x - property prefix: + @property + def prefix(self): """RETURNS (uint64): Length-N substring from the start of the word. Defaults to `N=1`. """ - def __get__(self): - return self.c.prefix + return self.c.prefix - def __set__(self, attr_t x): - self.c.prefix = x + @prefix.setter + def prefix(self, attr_t x): + self.c.prefix = x - property suffix: + @property + def suffix(self): """RETURNS (uint64): Length-N substring from the end of the word. Defaults to `N=3`. """ - def __get__(self): - return self.c.suffix + return self.c.suffix - def __set__(self, attr_t x): - self.c.suffix = x + @suffix.setter + def suffix(self, attr_t x): + self.c.suffix = x - property cluster: + @property + def cluster(self): """RETURNS (int): Brown cluster ID.""" - def __get__(self): - cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) - return cluster_table.get(self.c.orth, 0) + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) + return cluster_table.get(self.c.orth, 0) - def __set__(self, int x): - cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) - cluster_table[self.c.orth] = x + @cluster.setter + def cluster(self, int x): + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) + cluster_table[self.c.orth] = x - property lang: + @property + def lang(self): """RETURNS (uint64): Language of the parent vocabulary.""" - def __get__(self): - return self.c.lang + return self.c.lang - def __set__(self, attr_t x): - self.c.lang = x + @lang.setter + def lang(self, attr_t x): + self.c.lang = x - property prob: + @property + def prob(self): """RETURNS (float): Smoothed log probability estimate of the lexeme's type.""" - def __get__(self): - prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) - settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) - default_oov_prob = settings_table.get("oov_prob", -20.0) - return prob_table.get(self.c.orth, default_oov_prob) + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) + settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) + default_oov_prob = settings_table.get("oov_prob", -20.0) + return prob_table.get(self.c.orth, default_oov_prob) - def __set__(self, float x): - prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) - prob_table[self.c.orth] = x + @prob.setter + def prob(self, float x): + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) + prob_table[self.c.orth] = x - property lower_: + @property + def lower_(self): """RETURNS (str): Lowercase form of the word.""" - def __get__(self): - return self.vocab.strings[self.c.lower] + return self.vocab.strings[self.c.lower] - def 
__set__(self, str x): - self.c.lower = self.vocab.strings.add(x) + @lower_.setter + def lower_(self, str x): + self.c.lower = self.vocab.strings.add(x) - property norm_: + @property + def norm_(self): """RETURNS (str): The lexeme's norm, i.e. a normalised form of the lexeme text. """ - def __get__(self): - return self.vocab.strings[self.c.norm] + return self.vocab.strings[self.c.norm] - def __set__(self, str x): - self.norm = self.vocab.strings.add(x) + @norm_.setter + def norm_(self, str x): + self.norm = self.vocab.strings.add(x) - property shape_: + @property + def shape_(self): """RETURNS (str): Transform of the word's string, to show orthographic features. """ - def __get__(self): - return self.vocab.strings[self.c.shape] + return self.vocab.strings[self.c.shape] - def __set__(self, str x): - self.c.shape = self.vocab.strings.add(x) + @shape_.setter + def shape_(self, str x): + self.c.shape = self.vocab.strings.add(x) - property prefix_: + @property + def prefix_(self): """RETURNS (str): Length-N substring from the start of the word. Defaults to `N=1`. """ - def __get__(self): - return self.vocab.strings[self.c.prefix] + return self.vocab.strings[self.c.prefix] - def __set__(self, str x): - self.c.prefix = self.vocab.strings.add(x) + @prefix_.setter + def prefix_(self, str x): + self.c.prefix = self.vocab.strings.add(x) - property suffix_: + @property + def suffix_(self): """RETURNS (str): Length-N substring from the end of the word. Defaults to `N=3`. """ - def __get__(self): - return self.vocab.strings[self.c.suffix] + return self.vocab.strings[self.c.suffix] - def __set__(self, str x): - self.c.suffix = self.vocab.strings.add(x) + @suffix_.setter + def suffix_(self, str x): + self.c.suffix = self.vocab.strings.add(x) - property lang_: + @property + def lang_(self): """RETURNS (str): Language of the parent vocabulary.""" - def __get__(self): - return self.vocab.strings[self.c.lang] + return self.vocab.strings[self.c.lang] - def __set__(self, str x): - self.c.lang = self.vocab.strings.add(x) + @lang_.setter + def lang_(self, str x): + self.c.lang = self.vocab.strings.add(x) - property flags: + @property + def flags(self): """RETURNS (uint64): Container of the lexeme's binary flags.""" - def __get__(self): - return self.c.flags + return self.c.flags - def __set__(self, flags_t x): - self.c.flags = x + @flags.setter + def flags(self, flags_t x): + self.c.flags = x @property def is_oov(self): """RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" return self.orth not in self.vocab.vectors - property is_stop: + @property + def is_stop(self): """RETURNS (bool): Whether the lexeme is a stop word.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_STOP) + return Lexeme.c_check_flag(self.c, IS_STOP) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_STOP, x) + @is_stop.setter + def is_stop(self, bint x): + Lexeme.c_set_flag(self.c, IS_STOP, x) - property is_alpha: + @property + def is_alpha(self): """RETURNS (bool): Whether the lexeme consists of alphabetic characters. Equivalent to `lexeme.text.isalpha()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_ALPHA) + return Lexeme.c_check_flag(self.c, IS_ALPHA) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_ALPHA, x) + @is_alpha.setter + def is_alpha(self, bint x): + Lexeme.c_set_flag(self.c, IS_ALPHA, x) - property is_ascii: + @property + def is_ascii(self): """RETURNS (bool): Whether the lexeme consists of ASCII characters. Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. 
""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_ASCII) + return Lexeme.c_check_flag(self.c, IS_ASCII) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_ASCII, x) + @is_ascii.setter + def is_ascii(self, bint x): + Lexeme.c_set_flag(self.c, IS_ASCII, x) - property is_digit: + @property + def is_digit(self): """RETURNS (bool): Whether the lexeme consists of digits. Equivalent to `lexeme.text.isdigit()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_DIGIT) + return Lexeme.c_check_flag(self.c, IS_DIGIT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_DIGIT, x) + @is_digit.setter + def is_digit(self, bint x): + Lexeme.c_set_flag(self.c, IS_DIGIT, x) - property is_lower: + @property + def is_lower(self): """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to `lexeme.text.islower()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_LOWER) + return Lexeme.c_check_flag(self.c, IS_LOWER) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_LOWER, x) + @is_lower.setter + def is_lower(self, bint x): + Lexeme.c_set_flag(self.c, IS_LOWER, x) - property is_upper: + @property + def is_upper(self): """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to `lexeme.text.isupper()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_UPPER) + return Lexeme.c_check_flag(self.c, IS_UPPER) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_UPPER, x) + @is_upper.setter + def is_upper(self, bint x): + Lexeme.c_set_flag(self.c, IS_UPPER, x) - property is_title: + @property + def is_title(self): """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to `lexeme.text.istitle()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_TITLE) + return Lexeme.c_check_flag(self.c, IS_TITLE) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_TITLE, x) + @is_title.setter + def is_title(self, bint x): + Lexeme.c_set_flag(self.c, IS_TITLE, x) - property is_punct: + @property + def is_punct(self): """RETURNS (bool): Whether the lexeme is punctuation.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_PUNCT) + return Lexeme.c_check_flag(self.c, IS_PUNCT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_PUNCT, x) + @is_punct.setter + def is_punct(self, bint x): + Lexeme.c_set_flag(self.c, IS_PUNCT, x) - property is_space: + @property + def is_space(self): """RETURNS (bool): Whether the lexeme consist of whitespace characters. Equivalent to `lexeme.text.isspace()`. 
""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_SPACE) + return Lexeme.c_check_flag(self.c, IS_SPACE) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_SPACE, x) + @is_space.setter + def is_space(self, bint x): + Lexeme.c_set_flag(self.c, IS_SPACE, x) - property is_bracket: + @property + def is_bracket(self): """RETURNS (bool): Whether the lexeme is a bracket.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_BRACKET) + return Lexeme.c_check_flag(self.c, IS_BRACKET) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_BRACKET, x) + @is_bracket.setter + def is_bracket(self, bint x): + Lexeme.c_set_flag(self.c, IS_BRACKET, x) - property is_quote: + @property + def is_quote(self): """RETURNS (bool): Whether the lexeme is a quotation mark.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_QUOTE) + return Lexeme.c_check_flag(self.c, IS_QUOTE) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_QUOTE, x) + @is_quote.setter + def is_quote(self, bint x): + Lexeme.c_set_flag(self.c, IS_QUOTE, x) - property is_left_punct: + @property + def is_left_punct(self): """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) + return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) + @is_left_punct.setter + def is_left_punct(self, bint x): + Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) - property is_right_punct: + @property + def is_right_punct(self): """RETURNS (bool): Whether the lexeme is right punctuation, e.g. ).""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) + return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) + @is_right_punct.setter + def is_right_punct(self, bint x): + Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) - property is_currency: + @property + def is_currency(self): """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_CURRENCY) + return Lexeme.c_check_flag(self.c, IS_CURRENCY) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_CURRENCY, x) + @is_currency.setter + def is_currency(self, bint x): + Lexeme.c_set_flag(self.c, IS_CURRENCY, x) - property like_url: + @property + def like_url(self): """RETURNS (bool): Whether the lexeme resembles a URL.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, LIKE_URL) + return Lexeme.c_check_flag(self.c, LIKE_URL) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, LIKE_URL, x) + @like_url.setter + def like_url(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_URL, x) - property like_num: + @property + def like_num(self): """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9", "10", "ten", etc. 
""" - def __get__(self): - return Lexeme.c_check_flag(self.c, LIKE_NUM) + return Lexeme.c_check_flag(self.c, LIKE_NUM) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, LIKE_NUM, x) + @like_num.setter + def like_num(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_NUM, x) - property like_email: + @property + def like_email(self): """RETURNS (bool): Whether the lexeme resembles an email address.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, LIKE_EMAIL) + return Lexeme.c_check_flag(self.c, LIKE_EMAIL) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) + @like_email.setter + def like_email(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 6f2b10734..96545828f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -70,65 +70,72 @@ cdef class Tokenizer: self._special_matcher = PhraseMatcher(self.vocab) self._load_special_cases(rules) - property token_match: - def __get__(self): - return self._token_match + @property + def token_match(self): + return self._token_match - def __set__(self, token_match): - self._token_match = token_match - self._reload_special_cases() + @token_match.setter + def token_match(self, token_match): + self._token_match = token_match + self._reload_special_cases() - property url_match: - def __get__(self): - return self._url_match + @property + def url_match(self): + return self._url_match - def __set__(self, url_match): - self._url_match = url_match - self._reload_special_cases() + @url_match.setter + def url_match(self, url_match): + self._url_match = url_match + self._reload_special_cases() - property prefix_search: - def __get__(self): - return self._prefix_search + @property + def prefix_search(self): + return self._prefix_search - def __set__(self, prefix_search): - self._prefix_search = prefix_search - self._reload_special_cases() + @prefix_search.setter + def prefix_search(self, prefix_search): + self._prefix_search = prefix_search + self._reload_special_cases() - property suffix_search: - def __get__(self): - return self._suffix_search + @property + def suffix_search(self): + return self._suffix_search - def __set__(self, suffix_search): - self._suffix_search = suffix_search - self._reload_special_cases() + @suffix_search.setter + def suffix_search(self, suffix_search): + self._suffix_search = suffix_search + self._reload_special_cases() - property infix_finditer: - def __get__(self): - return self._infix_finditer + @property + def infix_finditer(self): + return self._infix_finditer - def __set__(self, infix_finditer): - self._infix_finditer = infix_finditer - self._reload_special_cases() + @infix_finditer.setter + def infix_finditer(self, infix_finditer): + self._infix_finditer = infix_finditer + self._reload_special_cases() - property rules: - def __get__(self): - return self._rules + @property + def rules(self): + return self._rules - def __set__(self, rules): - self._rules = {} - self._flush_cache() - self._flush_specials() - self._cache = PreshMap() - self._specials = PreshMap() - self._load_special_cases(rules) + @rules.setter + def rules(self, rules): + self._rules = {} + self._flush_cache() + self._flush_specials() + self._cache = PreshMap() + self._specials = PreshMap() + self._load_special_cases(rules) - property faster_heuristics: - def __get__(self): - return bool(self._faster_heuristics) + @property + def faster_heuristics(self): + return bool(self._faster_heuristics) - def __set__(self, faster_heuristics): - self._faster_heuristics = 
bool(faster_heuristics) - self._reload_special_cases() + @faster_heuristics.setter + def faster_heuristics(self, faster_heuristics): + self._faster_heuristics = bool(faster_heuristics) + self._reload_special_cases() def __reduce__(self): args = (self.vocab, diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 181c0ce0f..4d6249569 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -667,7 +667,8 @@ cdef class Doc: else: return False - property vector: + @property + def vector(self): """A real-valued meaning representation. Defaults to an average of the token vectors. @@ -676,48 +677,49 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#vector """ - def __get__(self): - if "vector" in self.user_hooks: - return self.user_hooks["vector"](self) - if self._vector is not None: - return self._vector - xp = get_array_module(self.vocab.vectors.data) - if not len(self): - self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") - return self._vector - elif self.vocab.vectors.size > 0: - self._vector = sum(t.vector for t in self) / len(self) - return self._vector - elif self.tensor.size > 0: - self._vector = self.tensor.mean(axis=0) - return self._vector - else: - return xp.zeros((self.vocab.vectors_length,), dtype="float32") + if "vector" in self.user_hooks: + return self.user_hooks["vector"](self) + if self._vector is not None: + return self._vector + xp = get_array_module(self.vocab.vectors.data) + if not len(self): + self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") + return self._vector + elif self.vocab.vectors.size > 0: + self._vector = sum(t.vector for t in self) / len(self) + return self._vector + elif self.tensor.size > 0: + self._vector = self.tensor.mean(axis=0) + return self._vector + else: + return xp.zeros((self.vocab.vectors_length,), dtype="float32") - def __set__(self, value): - self._vector = value + @vector.setter + def vector(self, value): + self._vector = value - property vector_norm: + @property + def vector_norm(self): """The L2 norm of the document's vector representation. RETURNS (float): The L2 norm of the vector representation. DOCS: https://spacy.io/api/doc#vector_norm """ - def __get__(self): - if "vector_norm" in self.user_hooks: - return self.user_hooks["vector_norm"](self) - cdef float value - cdef double norm = 0 - if self._vector_norm is None: - norm = 0.0 - for value in self.vector: - norm += value * value - self._vector_norm = sqrt(norm) if norm != 0 else 0 - return self._vector_norm + if "vector_norm" in self.user_hooks: + return self.user_hooks["vector_norm"](self) + cdef float value + cdef double norm = 0 + if self._vector_norm is None: + norm = 0.0 + for value in self.vector: + norm += value * value + self._vector_norm = sqrt(norm) if norm != 0 else 0 + return self._vector_norm - def __set__(self, value): - self._vector_norm = value + @vector_norm.setter + def vector_norm(self, value): + self._vector_norm = value @property def text(self): @@ -736,7 +738,8 @@ cdef class Doc: """ return self.text - property ents: + @property + def ents(self): """The named entities in the document. Returns a tuple of named entity `Span` objects, if the entity recognizer has been applied. 
@@ -744,55 +747,55 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#ents """ - def __get__(self): - cdef int i - cdef const TokenC* token - cdef int start = -1 - cdef attr_t label = 0 - cdef attr_t kb_id = 0 - cdef attr_t ent_id = 0 - output = [] - for i in range(self.length): - token = &self.c[i] - if token.ent_iob == 1: - if start == -1: - seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]] - raise ValueError(Errors.E093.format(seq=" ".join(seq))) - elif token.ent_iob == 2 or token.ent_iob == 0 or \ - (token.ent_iob == 3 and token.ent_type == 0): - if start != -1: - output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) - start = -1 - label = 0 - kb_id = 0 - ent_id = 0 - elif token.ent_iob == 3: - if start != -1: - output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) - start = i - label = token.ent_type - kb_id = token.ent_kb_id - ent_id = token.ent_id - if start != -1: - output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id)) - # remove empty-label spans - output = [o for o in output if o.label_ != ""] - return tuple(output) + cdef int i + cdef const TokenC* token + cdef int start = -1 + cdef attr_t label = 0 + cdef attr_t kb_id = 0 + cdef attr_t ent_id = 0 + output = [] + for i in range(self.length): + token = &self.c[i] + if token.ent_iob == 1: + if start == -1: + seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]] + raise ValueError(Errors.E093.format(seq=" ".join(seq))) + elif token.ent_iob == 2 or token.ent_iob == 0 or \ + (token.ent_iob == 3 and token.ent_type == 0): + if start != -1: + output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) + start = -1 + label = 0 + kb_id = 0 + ent_id = 0 + elif token.ent_iob == 3: + if start != -1: + output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) + start = i + label = token.ent_type + kb_id = token.ent_kb_id + ent_id = token.ent_id + if start != -1: + output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id)) + # remove empty-label spans + output = [o for o in output if o.label_ != ""] + return tuple(output) - def __set__(self, ents): - # TODO: - # 1. Test basic data-driven ORTH gazetteer - # 2. Test more nuanced date and currency regex - cdef attr_t kb_id, ent_id - cdef int ent_start, ent_end - ent_spans = [] - for ent_info in ents: - entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info) - if isinstance(entity_type_, str): - self.vocab.strings.add(entity_type_) - span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id) - ent_spans.append(span) - self.set_ents(ent_spans, default=SetEntsDefault.outside) + @ents.setter + def ents(self, ents): + # TODO: + # 1. Test basic data-driven ORTH gazetteer + # 2. Test more nuanced date and currency regex + cdef attr_t kb_id, ent_id + cdef int ent_start, ent_end + ent_spans = [] + for ent_info in ents: + entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info) + if isinstance(entity_type_, str): + self.vocab.strings.add(entity_type_) + span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id) + ent_spans.append(span) + self.set_ents(ent_spans, default=SetEntsDefault.outside) def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): """Set entity annotation. 
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index e179bbce7..64b8d7c6c 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -757,78 +757,87 @@ cdef class Span: for word in self.rights: yield from word.subtree - property start: - def __get__(self): - return self.c.start + @property + def start(self): + return self.c.start - def __set__(self, int start): - if start < 0: - raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start)) - self.c.start = start + @start.setter + def start(self, int start): + if start < 0: + raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start)) + self.c.start = start - property end: - def __get__(self): - return self.c.end + @property + def end(self): + return self.c.end - def __set__(self, int end): - if end < 0: - raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end)) - self.c.end = end + @end.setter + def end(self, int end): + if end < 0: + raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end)) + self.c.end = end - property start_char: - def __get__(self): - return self.c.start_char + @property + def start_char(self): + return self.c.start_char - def __set__(self, int start_char): - if start_char < 0: - raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char)) - self.c.start_char = start_char + @start_char.setter + def start_char(self, int start_char): + if start_char < 0: + raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char)) + self.c.start_char = start_char - property end_char: - def __get__(self): - return self.c.end_char + @property + def end_char(self): + return self.c.end_char - def __set__(self, int end_char): - if end_char < 0: - raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char)) - self.c.end_char = end_char + @end_char.setter + def end_char(self, int end_char): + if end_char < 0: + raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char)) + self.c.end_char = end_char - property label: - def __get__(self): - return self.c.label + @property + def label(self): + return self.c.label - def __set__(self, attr_t label): - self.c.label = label + @label.setter + def label(self, attr_t label): + self.c.label = label - property kb_id: - def __get__(self): - return self.c.kb_id + @property + def kb_id(self): + return self.c.kb_id - def __set__(self, attr_t kb_id): - self.c.kb_id = kb_id + @kb_id.setter + def kb_id(self, attr_t kb_id): + self.c.kb_id = kb_id - property id: - def __get__(self): - return self.c.id + @property + def id(self): + return self.c.id - def __set__(self, attr_t id): - self.c.id = id + @id.setter + def id(self, attr_t id): + self.c.id = id - property ent_id: + @property + def ent_id(self): """RETURNS (uint64): The entity ID.""" - def __get__(self): - return self.root.ent_id + return self.root.ent_id - def __set__(self, hash_t key): - raise NotImplementedError(Errors.E200.format(attr="ent_id")) + @ent_id.setter + def ent_id(self, hash_t key): + raise NotImplementedError(Errors.E200.format(attr="ent_id")) - property ent_id_: + @property + def ent_id_(self): """RETURNS (str): The (string) entity ID.""" - def __get__(self): - return self.root.ent_id_ + return self.root.ent_id_ - def __set__(self, str key): - raise NotImplementedError(Errors.E200.format(attr="ent_id_")) + @ent_id_.setter + def ent_id_(self, str key): + raise NotImplementedError(Errors.E200.format(attr="ent_id_")) @property def 
orth_(self): @@ -843,29 +852,32 @@ cdef class Span: """RETURNS (str): The span's lemma.""" return "".join([t.lemma_ + t.whitespace_ for t in self]).strip() - property label_: + @property + def label_(self): """RETURNS (str): The span's label.""" - def __get__(self): - return self.doc.vocab.strings[self.label] + return self.doc.vocab.strings[self.label] - def __set__(self, str label_): - self.label = self.doc.vocab.strings.add(label_) + @label_.setter + def label_(self, str label_): + self.label = self.doc.vocab.strings.add(label_) - property kb_id_: + @property + def kb_id_(self): """RETURNS (str): The span's KB ID.""" - def __get__(self): - return self.doc.vocab.strings[self.kb_id] + return self.doc.vocab.strings[self.kb_id] - def __set__(self, str kb_id_): - self.kb_id = self.doc.vocab.strings.add(kb_id_) + @kb_id_.setter + def kb_id_(self, str kb_id_): + self.kb_id = self.doc.vocab.strings.add(kb_id_) - property id_: + @property + def id_(self): """RETURNS (str): The span's ID.""" - def __get__(self): - return self.doc.vocab.strings[self.id] + return self.doc.vocab.strings[self.id] - def __set__(self, str id_): - self.id = self.doc.vocab.strings.add(id_) + @id_.setter + def id_(self, str id_): + self.id = self.doc.vocab.strings.add(id_) cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 2ed736b70..a3efd5886 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -249,15 +249,16 @@ cdef class Token: """ return not self.c.morph == 0 - property morph: - def __get__(self): - return MorphAnalysis.from_id(self.vocab, self.c.morph) + @property + def morph(self): + return MorphAnalysis.from_id(self.vocab, self.c.morph) - def __set__(self, MorphAnalysis morph): - # Check that the morph has the same vocab - if self.vocab != morph.vocab: - raise ValueError(Errors.E1013) - self.c.morph = morph.c.key + @morph.setter + def morph(self, MorphAnalysis morph): + # Check that the morph has the same vocab + if self.vocab != morph.vocab: + raise ValueError(Errors.E1013) + self.c.morph = morph.c.key def set_morph(self, features): cdef hash_t key @@ -377,39 +378,43 @@ cdef class Token: """ return self.c.lex.suffix - property lemma: + @property + def lemma(self): """RETURNS (uint64): ID of the base form of the word, with no inflectional suffixes. 
""" - def __get__(self): - return self.c.lemma + return self.c.lemma - def __set__(self, attr_t lemma): - self.c.lemma = lemma + @lemma.setter + def lemma(self, attr_t lemma): + self.c.lemma = lemma - property pos: + @property + def pos(self): """RETURNS (uint64): ID of coarse-grained part-of-speech tag.""" - def __get__(self): - return self.c.pos + return self.c.pos - def __set__(self, pos): - self.c.pos = pos + @pos.setter + def pos(self, pos): + self.c.pos = pos - property tag: + @property + def tag(self): """RETURNS (uint64): ID of fine-grained part-of-speech tag.""" - def __get__(self): - return self.c.tag + return self.c.tag - def __set__(self, attr_t tag): - self.c.tag = tag + @tag.setter + def tag(self, attr_t tag): + self.c.tag = tag - property dep: + @property + def dep(self): """RETURNS (uint64): ID of syntactic dependency label.""" - def __get__(self): - return self.c.dep + return self.c.dep - def __set__(self, attr_t label): - self.c.dep = label + @dep.setter + def dep(self, attr_t label): + self.c.dep = label @property def has_vector(self): @@ -494,48 +499,51 @@ cdef class Token: return self.doc.user_token_hooks["sent"](self) return self.doc[self.i : self.i+1].sent - property sent_start: - def __get__(self): - """Deprecated: use Token.is_sent_start instead.""" - # Raising a deprecation warning here causes errors for autocomplete - # Handle broken backwards compatibility case: doc[0].sent_start - # was False. - if self.i == 0: - return False - else: - return self.c.sent_start + @property + def sent_start(self): + """Deprecated: use Token.is_sent_start instead.""" + # Raising a deprecation warning here causes errors for autocomplete + # Handle broken backwards compatibility case: doc[0].sent_start + # was False. + if self.i == 0: + return False + else: + return self.c.sent_start - def __set__(self, value): - self.is_sent_start = value + @sent_start.setter + def sent_start(self, value): + self.is_sent_start = value - property is_sent_start: + @property + def is_sent_start(self): """A boolean value indicating whether the token starts a sentence. `None` if unknown. Defaults to `True` for the first token in the `Doc`. RETURNS (bool / None): Whether the token starts a sentence. None if unknown. """ - def __get__(self): - if self.c.sent_start == 0: - return None - elif self.c.sent_start < 0: - return False - else: - return True + if self.c.sent_start == 0: + return None + elif self.c.sent_start < 0: + return False + else: + return True - def __set__(self, value): - if self.doc.has_annotation("DEP"): - raise ValueError(Errors.E043) - if value is None: - self.c.sent_start = 0 - elif value is True: - self.c.sent_start = 1 - elif value is False: - self.c.sent_start = -1 - else: - raise ValueError(Errors.E044.format(value=value)) + @is_sent_start.setter + def is_sent_start(self, value): + if self.doc.has_annotation("DEP"): + raise ValueError(Errors.E043) + if value is None: + self.c.sent_start = 0 + elif value is True: + self.c.sent_start = 1 + elif value is False: + self.c.sent_start = -1 + else: + raise ValueError(Errors.E044.format(value=value)) - property is_sent_end: + @property + def is_sent_end(self): """A boolean value indicating whether the token ends a sentence. `None` if unknown. Defaults to `True` for the last token in the `Doc`. 
@@ -544,18 +552,18 @@ cdef class Token: DOCS: https://spacy.io/api/token#is_sent_end """ - def __get__(self): - if self.i + 1 == len(self.doc): - return True - elif self.doc[self.i+1].is_sent_start is None: - return None - elif self.doc[self.i+1].is_sent_start is True: - return True - else: - return False + if self.i + 1 == len(self.doc): + return True + elif self.doc[self.i+1].is_sent_start is None: + return None + elif self.doc[self.i+1].is_sent_start is True: + return True + else: + return False - def __set__(self, value): - raise ValueError(Errors.E196) + @is_sent_end.setter + def is_sent_end(self, value): + raise ValueError(Errors.E196) @property def lefts(self): @@ -682,41 +690,42 @@ cdef class Token: """ return not Token.missing_head(self.c) - property head: + @property + def head(self): """The syntactic parent, or "governor", of this token. If token.has_head() is `False`, this method will return itself. RETURNS (Token): The token predicted by the parser to be the head of the current token. """ - def __get__(self): - if not self.has_head(): - return self - else: - return self.doc[self.i + self.c.head] + if not self.has_head(): + return self + else: + return self.doc[self.i + self.c.head] - def __set__(self, Token new_head): - # This function sets the head of self to new_head and updates the - # counters for left/right dependents and left/right corner for the - # new and the old head - # Check that token is from the same document - if self.doc != new_head.doc: - raise ValueError(Errors.E191) - # Do nothing if old head is new head - if self.i + self.c.head == new_head.i: - return - # Find the widest l/r_edges of the roots of the two tokens involved - # to limit the number of tokens for set_children_from_heads - cdef Token self_root, new_head_root - self_root = ([self] + list(self.ancestors))[-1] - new_head_ancestors = list(new_head.ancestors) - new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head - start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge - end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge - # Set new head - self.c.head = new_head.i - self.i - # Adjust parse properties and sentence starts - set_children_from_heads(self.doc.c, start, end + 1) + @head.setter + def head(self, Token new_head): + # This function sets the head of self to new_head and updates the + # counters for left/right dependents and left/right corner for the + # new and the old head + # Check that token is from the same document + if self.doc != new_head.doc: + raise ValueError(Errors.E191) + # Do nothing if old head is new head + if self.i + self.c.head == new_head.i: + return + # Find the widest l/r_edges of the roots of the two tokens involved + # to limit the number of tokens for set_children_from_heads + cdef Token self_root, new_head_root + self_root = ([self] + list(self.ancestors))[-1] + new_head_ancestors = list(new_head.ancestors) + new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head + start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge + end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge + # Set new head + self.c.head = new_head.i - self.i + # Adjust parse properties and sentence starts + set_children_from_heads(self.doc.c, start, end + 1) @property def conjuncts(self): @@ -744,21 +753,23 @@ cdef class Token: queue.append(child) return tuple([w for w in 
output if w.i != self.i]) - property ent_type: + @property + def ent_type(self): """RETURNS (uint64): Named entity type.""" - def __get__(self): - return self.c.ent_type + return self.c.ent_type - def __set__(self, ent_type): - self.c.ent_type = ent_type + @ent_type.setter + def ent_type(self, ent_type): + self.c.ent_type = ent_type - property ent_type_: + @property + def ent_type_(self): """RETURNS (str): Named entity type.""" - def __get__(self): - return self.vocab.strings[self.c.ent_type] + return self.vocab.strings[self.c.ent_type] - def __set__(self, ent_type): - self.c.ent_type = self.vocab.strings.add(ent_type) + @ent_type_.setter + def ent_type_(self, ent_type): + self.c.ent_type = self.vocab.strings.add(ent_type) @property def ent_iob(self): @@ -784,41 +795,45 @@ cdef class Token: """ return self.iob_strings()[self.c.ent_iob] - property ent_id: + @property + def ent_id(self): """RETURNS (uint64): ID of the entity the token is an instance of, if any. """ - def __get__(self): - return self.c.ent_id + return self.c.ent_id - def __set__(self, hash_t key): - self.c.ent_id = key + @ent_id.setter + def ent_id(self, hash_t key): + self.c.ent_id = key - property ent_id_: + @property + def ent_id_(self): """RETURNS (str): ID of the entity the token is an instance of, if any. """ - def __get__(self): - return self.vocab.strings[self.c.ent_id] + return self.vocab.strings[self.c.ent_id] - def __set__(self, name): - self.c.ent_id = self.vocab.strings.add(name) + @ent_id_.setter + def ent_id_(self, name): + self.c.ent_id = self.vocab.strings.add(name) - property ent_kb_id: + @property + def ent_kb_id(self): """RETURNS (uint64): Named entity KB ID.""" - def __get__(self): - return self.c.ent_kb_id + return self.c.ent_kb_id - def __set__(self, attr_t ent_kb_id): - self.c.ent_kb_id = ent_kb_id + @ent_kb_id.setter + def ent_kb_id(self, attr_t ent_kb_id): + self.c.ent_kb_id = ent_kb_id - property ent_kb_id_: + @property + def ent_kb_id_(self): """RETURNS (str): Named entity KB ID.""" - def __get__(self): - return self.vocab.strings[self.c.ent_kb_id] + return self.vocab.strings[self.c.ent_kb_id] - def __set__(self, ent_kb_id): - self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id) + @ent_kb_id_.setter + def ent_kb_id_(self, ent_kb_id): + self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id) @property def whitespace_(self): @@ -840,16 +855,17 @@ cdef class Token: """ return self.vocab.strings[self.c.lex.lower] - property norm_: + @property + def norm_(self): """RETURNS (str): The token's norm, i.e. a normalised form of the token text. Usually set in the language's tokenizer exceptions or norm exceptions. """ - def __get__(self): - return self.vocab.strings[self.norm] + return self.vocab.strings[self.norm] - def __set__(self, str norm_): - self.c.norm = self.vocab.strings.add(norm_) + @norm_.setter + def norm_(self, str norm_): + self.c.norm = self.vocab.strings.add(norm_) @property def shape_(self): @@ -879,33 +895,36 @@ cdef class Token: """ return self.vocab.strings[self.c.lex.lang] - property lemma_: + @property + def lemma_(self): """RETURNS (str): The token lemma, i.e. the base form of the word, with no inflectional suffixes. 
""" - def __get__(self): - return self.vocab.strings[self.c.lemma] + return self.vocab.strings[self.c.lemma] - def __set__(self, str lemma_): - self.c.lemma = self.vocab.strings.add(lemma_) + @lemma_.setter + def lemma_(self, str lemma_): + self.c.lemma = self.vocab.strings.add(lemma_) - property pos_: + @property + def pos_(self): """RETURNS (str): Coarse-grained part-of-speech tag.""" - def __get__(self): - return parts_of_speech.NAMES[self.c.pos] + return parts_of_speech.NAMES[self.c.pos] - def __set__(self, pos_name): - if pos_name not in parts_of_speech.IDS: - raise ValueError(Errors.E1021.format(pp=pos_name)) - self.c.pos = parts_of_speech.IDS[pos_name] + @pos_.setter + def pos_(self, pos_name): + if pos_name not in parts_of_speech.IDS: + raise ValueError(Errors.E1021.format(pp=pos_name)) + self.c.pos = parts_of_speech.IDS[pos_name] - property tag_: + @property + def tag_(self): """RETURNS (str): Fine-grained part-of-speech tag.""" - def __get__(self): - return self.vocab.strings[self.c.tag] + return self.vocab.strings[self.c.tag] - def __set__(self, tag): - self.tag = self.vocab.strings.add(tag) + @tag_.setter + def tag_(self, tag): + self.tag = self.vocab.strings.add(tag) def has_dep(self): """Check whether the token has annotated dep information. @@ -915,13 +934,14 @@ cdef class Token: """ return not Token.missing_dep(self.c) - property dep_: + @property + def dep_(self): """RETURNS (str): The syntactic dependency label.""" - def __get__(self): - return self.vocab.strings[self.c.dep] + return self.vocab.strings[self.c.dep] - def __set__(self, str label): - self.c.dep = self.vocab.strings.add(label) + @dep_.setter + def dep_(self, str label): + self.c.dep = self.vocab.strings.add(label) @property def is_oov(self): diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index abdcecf71..2c1ff34cf 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -88,23 +88,25 @@ cdef class Example: def __len__(self): return len(self.predicted) - property predicted: - def __get__(self): - return self.x + @property + def predicted(self): + return self.x - def __set__(self, doc): - self.x = doc - self._cached_alignment = None - self._cached_words_x = [t.text for t in doc] + @predicted.setter + def predicted(self, doc): + self.x = doc + self._cached_alignment = None + self._cached_words_x = [t.text for t in doc] - property reference: - def __get__(self): - return self.y + @property + def reference(self): + return self.y - def __set__(self, doc): - self.y = doc - self._cached_alignment = None - self._cached_words_y = [t.text for t in doc] + @reference.setter + def reference(self, doc): + self.y = doc + self._cached_alignment = None + self._cached_words_y = [t.text for t in doc] def copy(self): return Example( @@ -420,9 +422,9 @@ cdef class Example: seen_indices.update(indices) return output - property text: - def __get__(self): - return self.x.text + @property + def text(self): + return self.x.text def __str__(self): return str(self.to_dict()) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 4004a70e0..19e6eb005 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -88,16 +88,17 @@ cdef class Vocab: self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks - property vectors: - def __get__(self): - return self._vectors + @property + def vectors(self): + return self._vectors - def __set__(self, vectors): - if hasattr(vectors, "strings"): - for s in vectors.strings: - self.strings.add(s) - self._vectors = vectors - self._vectors.strings = 
self.strings + @vectors.setter + def vectors(self, vectors): + if hasattr(vectors, "strings"): + for s in vectors.strings: + self.strings.add(s) + self._vectors = vectors + self._vectors.strings = self.strings @property def lang(self): @@ -464,17 +465,18 @@ cdef class Vocab: key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) return key in self.vectors - property lookups: - def __get__(self): - return self._lookups + @property + def lookups(self): + return self._lookups - def __set__(self, lookups): - self._lookups = lookups - if lookups.has_table("lexeme_norm"): - self.lex_attr_getters[NORM] = util.add_lookups( - self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), - self.lookups.get_table("lexeme_norm"), - ) + @lookups.setter + def lookups(self, lookups): + self._lookups = lookups + if lookups.has_table("lexeme_norm"): + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), + self.lookups.get_table("lexeme_norm"), + ) def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. From 2e2334632beb0e91abc1d7820a0471a10af61489 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 16 Apr 2024 12:00:22 +0200 Subject: [PATCH 14/20] Fix use_gold_ents behaviour for EntityLinker (#13400) * fix type annotation in docs * only restore entities after loss calculation * restore entities of sample in initialization * rename overfitting function * fix EL scorer * Relax test * fix formatting * Update spacy/pipeline/entity_linker.py Co-authored-by: Raphael Mitsch * rename to _ensure_ents * further rename * allow for scorer to be None --------- Co-authored-by: Raphael Mitsch --- spacy/pipeline/entity_linker.py | 63 +++++++----- spacy/tests/pipeline/test_entity_linker.py | 107 ++++++++++++++++++++- website/docs/api/entitylinker.mdx | 2 +- 3 files changed, 145 insertions(+), 27 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index a730ece1b..40a9c8a79 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -11,7 +11,6 @@ from .. import util from ..errors import Errors from ..kb import Candidate, KnowledgeBase from ..language import Language -from ..ml import empty_kb from ..scorer import Scorer from ..tokens import Doc, Span from ..training import Example, validate_examples, validate_get_examples @@ -105,7 +104,7 @@ def make_entity_linker( ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. - use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another + use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another component must provide entity annotations. candidates_batch_size (int): Size of batches for entity candidate generation. threshold (Optional[float]): Confidence threshold for entity predictions. 
If confidence is below the threshold, @@ -235,7 +234,6 @@ class EntityLinker(TrainablePipe): self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) self.kb = generate_empty_kb(self.vocab, entity_vector_length) - self.scorer = scorer self.use_gold_ents = use_gold_ents self.candidates_batch_size = candidates_batch_size self.threshold = threshold @@ -243,6 +241,37 @@ class EntityLinker(TrainablePipe): if candidates_batch_size < 1: raise ValueError(Errors.E1044) + def _score_with_ents_set(examples: Iterable[Example], **kwargs): + # Because of how spaCy works, we can't just score immediately, because Language.evaluate + # calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline. + if not scorer: + return scorer + if not self.use_gold_ents: + return scorer(examples, **kwargs) + else: + examples = self._ensure_ents(examples) + docs = self.pipe( + (eg.predicted for eg in examples), + ) + for eg, doc in zip(examples, docs): + eg.predicted = doc + return scorer(examples, **kwargs) + + self.scorer = _score_with_ents_set + + def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]: + """If use_gold_ents is true, set the gold entities to (a copy of) eg.predicted.""" + if not self.use_gold_ents: + return examples + + new_examples = [] + for eg in examples: + ents, _ = eg.get_aligned_ents_and_ner() + new_eg = eg.copy() + new_eg.predicted.ents = ents + new_examples.append(new_eg) + return new_examples + def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will create it using this object's vocab.""" @@ -284,11 +313,9 @@ class EntityLinker(TrainablePipe): nO = self.kb.entity_vector_length doc_sample = [] vector_sample = [] - for eg in islice(get_examples(), 10): + examples = self._ensure_ents(islice(get_examples(), 10)) + for eg in examples: doc = eg.x - if self.use_gold_ents: - ents, _ = eg.get_aligned_ents_and_ner() - doc.ents = ents doc_sample.append(doc) vector_sample.append(self.model.ops.alloc1f(nO)) assert len(doc_sample) > 0, Errors.E923.format(name=self.name) @@ -354,31 +381,17 @@ class EntityLinker(TrainablePipe): losses.setdefault(self.name, 0.0) if not examples: return losses + examples = self._ensure_ents(examples) validate_examples(examples, "EntityLinker.update") - set_dropout_rate(self.model, drop) - docs = [eg.predicted for eg in examples] - # save to restore later - old_ents = [doc.ents for doc in docs] - - for doc, ex in zip(docs, examples): - if self.use_gold_ents: - ents, _ = ex.get_aligned_ents_and_ner() - doc.ents = ents - else: - # only keep matching ents - doc.ents = ex.get_matching_ents() - # make sure we have something to learn from, if not, short-circuit if not self.batch_has_learnable_example(examples): return losses + set_dropout_rate(self.model, drop) + docs = [eg.predicted for eg in examples] sentence_encodings, bp_context = self.model.begin_update(docs) - # now restore the ents - for doc, old in zip(docs, old_ents): - doc.ents = old - loss, d_scores = self.get_loss( sentence_encodings=sentence_encodings, examples=examples ) @@ -386,11 +399,13 @@ class EntityLinker(TrainablePipe): if sgd is not None: self.finish_update(sgd) losses[self.name] += loss + return losses def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): validate_examples(examples, "EntityLinker.get_loss") entity_encodings = [] + # We assume that get_loss is called with gold ents set in the examples if need be eidx = 0 
# indices in gold entities to keep keep_ents = [] # indices in sentence_encodings to keep diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 00771a0f0..5e50a4d28 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -717,7 +717,7 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] # fmt: on -def test_overfitting_IO(): +def test_overfitting_IO_gold_entities(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() vector_length = 3 @@ -744,7 +744,9 @@ def test_overfitting_IO(): return mykb # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker = nlp.add_pipe( + "entity_linker", last=True, config={"use_gold_ents": True} + ) assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings @@ -807,6 +809,107 @@ def test_overfitting_IO(): assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + eval = nlp.evaluate(train_examples) + assert "nel_macro_p" in eval + assert "nel_macro_r" in eval + assert "nel_macro_f" in eval + assert "nel_micro_p" in eval + assert "nel_micro_r" in eval + assert "nel_micro_f" in eval + assert "nel_f_per_type" in eval + assert "PERSON" in eval["nel_f_per_type"] + + assert eval["nel_macro_f"] > 0 + assert eval["nel_micro_f"] > 0 + + +def test_overfitting_IO_with_ner(): + # Simple test to try and overfit the NER and NEL component in combination - ensuring the ML models work correctly + nlp = English() + vector_length = 3 + assert "Q2146908" not in nlp.vocab.strings + + # Convert the texts to docs to make sure we have doc.ents set for the training examples + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB - assign same prior weight to the two russ cochran's + # Q2146908 (Russ Cochran): American golfer + # Q7381115 (Russ Cochran): publisher + mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) + return mykb + + # Create the NER and EL components and add them to the pipeline + ner = nlp.add_pipe("ner", first=True) + entity_linker = nlp.add_pipe( + "entity_linker", last=True, config={"use_gold_ents": False} + ) + entity_linker.set_kb(create_kb) + + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for ent in annotations.get("entities"): + ner.add_label(ent[2]) + optimizer = nlp.initialize() + + # train the NER and NEL pipes + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + assert losses["ner"] < 0.001 + assert losses["entity_linker"] < 0.001 + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + + # test the trained model + test_text = "Russ Cochran captured his first major title with his son as caddie." 
+ doc = nlp(test_text) + ents = doc.ents + assert len(ents) == 1 + assert ents[0].text == "Russ Cochran" + assert ents[0].label_ == "PERSON" + assert ents[0].kb_id_ != "NIL" + + # TODO: below assert is still flaky - EL doesn't properly overfit quite yet + # assert ents[0].kb_id_ == "Q2146908" + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + assert nlp2.pipe_names == nlp.pipe_names + doc2 = nlp2(test_text) + ents2 = doc2.ents + assert len(ents2) == 1 + assert ents2[0].text == "Russ Cochran" + assert ents2[0].label_ == "PERSON" + assert ents2[0].kb_id_ != "NIL" + + eval = nlp.evaluate(train_examples) + assert "nel_macro_f" in eval + assert "nel_micro_f" in eval + assert "ents_f" in eval + assert "nel_f_per_type" in eval + assert "ents_per_type" in eval + assert "PERSON" in eval["nel_f_per_type"] + assert "PERSON" in eval["ents_per_type"] + + assert eval["nel_macro_f"] > 0 + assert eval["nel_micro_f"] > 0 + assert eval["ents_f"] > 0 + def test_kb_serialization(): # Test that the KB can be used in a pipeline with a different vocab diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 21d2e9015..c7b11985a 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -61,7 +61,7 @@ architectures and their arguments and hyperparameters. | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | | `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). 
~~Callable[[Vocab, int], KnowledgeBase]~~ | From 6d6c10ab9c2ff1059fdb062c4421a2ddd6c40c04 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 29 Apr 2024 10:18:07 +0200 Subject: [PATCH 15/20] Fix CI (#13469) * Remove hardcoded architecture setting * update classifiers to include Python 3.12 --- .github/workflows/tests.yml | 2 -- .github/workflows/universe_validation.yml | 1 - setup.cfg | 1 + 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 840b8e5f9..2a236b6bd 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -31,7 +31,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: "3.7" - architecture: x64 - name: black run: | @@ -81,7 +80,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python_version }} - architecture: x64 - name: Install dependencies run: | diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml index a1e3253a9..4d492500c 100644 --- a/.github/workflows/universe_validation.yml +++ b/.github/workflows/universe_validation.yml @@ -26,7 +26,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: "3.7" - architecture: x64 - name: Validate website/meta/universe.json run: | diff --git a/setup.cfg b/setup.cfg index a6b14eb06..899e808cb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,7 @@ classifiers = Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 Topic :: Scientific/Engineering project_urls = Release notes = https://github.com/explosion/spaCy/releases From 74836524e3372a158ecc42ba49b10a0baad975d4 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 29 Apr 2024 10:36:31 +0200 Subject: [PATCH 16/20] Bump to v5 (#13470) --- .github/workflows/lock.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml index 6c3985a93..2bbdd64c7 100644 --- a/.github/workflows/lock.yml +++ b/.github/workflows/lock.yml @@ -16,7 +16,7 @@ jobs: if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - uses: dessant/lock-threads@v4 + - uses: dessant/lock-threads@v5 with: process-only: 'issues' issue-inactive-days: '30' From 045cd43c3f8a2c2529393b464085809e995b6e8f Mon Sep 17 00:00:00 2001 From: Alex Strick van Linschoten Date: Mon, 29 Apr 2024 11:10:17 +0200 Subject: [PATCH 17/20] Fix typos in docs (#13466) * fix typos * prettier formatting --------- Co-authored-by: svlandeg --- spacy/cli/find_threshold.py | 4 +- spacy/tests/test_language.py | 2 +- website/docs/api/attributes.mdx | 60 ++++++++++----------- website/docs/api/cli.mdx | 4 +- website/docs/api/entitylinker.mdx | 32 +++++------ website/docs/api/entityruler.mdx | 6 +-- website/docs/api/span.mdx | 2 +- website/docs/api/transformer.mdx | 2 +- website/docs/usage/layers-architectures.mdx | 2 +- website/docs/usage/linguistic-features.mdx | 2 +- website/docs/usage/projects.mdx | 4 +- website/docs/usage/saving-loading.mdx | 11 ++-- website/docs/usage/v2-2.mdx | 2 +- website/docs/usage/v3-2.mdx | 2 +- 14 files changed, 69 insertions(+), 66 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 48077fa51..3e86495e7 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -39,7 +39,7 @@ def find_threshold_cli( # fmt: on ): """ - Runs prediction trials for a trained model with varying tresholds to maximize 
+ Runs prediction trials for a trained model with varying thresholds to maximize the specified metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` @@ -81,7 +81,7 @@ def find_threshold( silent: bool = True, ) -> Tuple[float, float, Dict[float, float]]: """ - Runs prediction trials for models with varying tresholds to maximize the specified metric. + Runs prediction trials for models with varying thresholds to maximize the specified metric. model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory. data_path (Path): Path to file with DocBin with docs to use for threshold search. pipe_name (str): Name of pipe to examine thresholds for. diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index d229739e1..ee707f793 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -329,7 +329,7 @@ def test_language_pipe_error_handler(n_process): nlp.set_error_handler(raise_error) with pytest.raises(ValueError): list(nlp.pipe(texts, n_process=n_process)) - # set explicitely to ignoring + # set explicitly to ignoring nlp.set_error_handler(ignore_error) docs = list(nlp.pipe(texts, n_process=n_process)) assert len(docs) == 0 diff --git a/website/docs/api/attributes.mdx b/website/docs/api/attributes.mdx index 3142b741d..9cb76ac58 100644 --- a/website/docs/api/attributes.mdx +++ b/website/docs/api/attributes.mdx @@ -45,33 +45,33 @@ For attributes that represent string values, the internal integer ID is accessed as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by appending `_` as in `token.dep_`. -| Attribute | Description | -| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `DEP` | The token's dependency label. ~~str~~ | -| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | -| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | -| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | -| `ENT_TYPE` | The token's entity label. ~~str~~ | -| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | -| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ | -| `IS_DIGIT` | Token text consists of digits. ~~bool~~ | -| `IS_LOWER` | Token text is in lowercase. ~~bool~~ | -| `IS_PUNCT` | Token is punctuation. ~~bool~~ | -| `IS_SPACE` | Token is whitespace. ~~bool~~ | -| `IS_STOP` | Token is a stop word. ~~bool~~ | -| `IS_TITLE` | Token text is in titlecase. ~~bool~~ | -| `IS_UPPER` | Token text is in uppercase. ~~bool~~ | -| `LEMMA` | The token's lemma. ~~str~~ | -| `LENGTH` | The length of the token text. ~~int~~ | -| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ | -| `LIKE_NUM` | Token text resembles a number. ~~bool~~ | -| `LIKE_URL` | Token text resembles a URL. ~~bool~~ | -| `LOWER` | The lowercase form of the token text. ~~str~~ | -| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ | -| `NORM` | The normalized form of the token text. ~~str~~ | -| `ORTH` | The exact verbatim text of a token. ~~str~~ | -| `POS` | The token's universal part of speech (UPOS). ~~str~~ | -| `SENT_START` | Token is start of sentence. 
~~bool~~ | -| `SHAPE` | The token's shape. ~~str~~ | -| `SPACY` | Token has a trailing space. ~~bool~~ | -| `TAG` | The token's fine-grained part of speech. ~~str~~ | +| Attribute | Description | +| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `DEP` | The token's dependency label. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | +| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ | +| `IS_DIGIT` | Token text consists of digits. ~~bool~~ | +| `IS_LOWER` | Token text is in lowercase. ~~bool~~ | +| `IS_PUNCT` | Token is punctuation. ~~bool~~ | +| `IS_SPACE` | Token is whitespace. ~~bool~~ | +| `IS_STOP` | Token is a stop word. ~~bool~~ | +| `IS_TITLE` | Token text is in titlecase. ~~bool~~ | +| `IS_UPPER` | Token text is in uppercase. ~~bool~~ | +| `LEMMA` | The token's lemma. ~~str~~ | +| `LENGTH` | The length of the token text. ~~int~~ | +| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ | +| `LIKE_NUM` | Token text resembles a number. ~~bool~~ | +| `LIKE_URL` | Token text resembles a URL. ~~bool~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ | +| `NORM` | The normalized form of the token text. ~~str~~ | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `POS` | The token's universal part of speech (UPOS). ~~str~~ | +| `SENT_START` | Token is start of sentence. ~~bool~~ | +| `SHAPE` | The token's shape. ~~str~~ | +| `SPACY` | Token has a trailing space. ~~bool~~ | +| `TAG` | The token's fine-grained part of speech. ~~str~~ | diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 950d98c1f..6c47c8f1e 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -567,7 +567,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) ✔ Good amount of examples for all labels -✔ Examples without occurences available for all labels +✔ Examples without occurrences available for all labels ✔ No entities consisting of or starting/ending with whitespace =========================== Part-of-speech Tagging =========================== @@ -1320,7 +1320,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] ## find-threshold {id="find-threshold",version="3.5",tag="command"} -Runs prediction trials for a trained model with varying tresholds to maximize +Runs prediction trials for a trained model with varying thresholds to maximize the specified metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. 
Results are displayed in a table on `stdout` (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index c7b11985a..f4b83d88b 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -67,7 +67,7 @@ architectures and their arguments and hyperparameters. | `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | | `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -100,21 +100,21 @@ custom knowledge base, you should either call [`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the [`initialize`](/api/entitylinker#initialize) call. -| Name | Description | -| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. 
~~Optional[float]~~ | +| Name | Description | +| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ## EntityLinker.\_\_call\_\_ {id="call",tag="method"} diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx index 27624398e..335e87676 100644 --- a/website/docs/api/entityruler.mdx +++ b/website/docs/api/entityruler.mdx @@ -58,7 +58,7 @@ how the component should be configured. You can override its settings via the | Setting | Description | | ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | @@ -92,7 +92,7 @@ be a token pattern (list) or a phrase pattern (string). For example: | `name` 3 | Instance name of the current pipeline component. 
Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ | | _keyword-only_ | | | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | | `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | @@ -173,7 +173,7 @@ happens automatically after the component has been added to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized with `overwrite_ents=True`, existing entities will be replaced if they overlap with the matches. When matches overlap in a Doc, the entity ruler prioritizes -longer patterns over shorter, and if equal the match occuring first in the Doc +longer patterns over shorter, and if equal the match occurring first in the Doc is chosen. > #### Example diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index 41422a5b4..225ff6e6a 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -288,7 +288,7 @@ it – so no NP-level coordination, no prepositional phrases, and no relative clauses. If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) -has not been implemeted for the given language, a `NotImplementedError` is +has not been implemented for the given language, a `NotImplementedError` is raised. > #### Example diff --git a/website/docs/api/transformer.mdx b/website/docs/api/transformer.mdx index 8f024553d..9dcafb557 100644 --- a/website/docs/api/transformer.mdx +++ b/website/docs/api/transformer.mdx @@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the | `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | | `width` | The width of the last hidden layer. ~~int~~ | -### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"} +### TransformerData.empty {id="transformerdata-empty",tag="classmethod"} Create an empty `TransformerData` container. diff --git a/website/docs/usage/layers-architectures.mdx b/website/docs/usage/layers-architectures.mdx index 03b85f5af..344c66e8d 100644 --- a/website/docs/usage/layers-architectures.mdx +++ b/website/docs/usage/layers-architectures.mdx @@ -832,7 +832,7 @@ retrieve and add to them. After creation, the component needs to be [initialized](/usage/training#initialization). 
This method can define the -relevant labels in two ways: explicitely by setting the `labels` argument in the +relevant labels in two ways: explicitly by setting the `labels` argument in the [`initialize` block](/api/data-formats#config-initialize) of the config, or implicately by deducing them from the `get_examples` callback that generates the full **training data set**, or a representative sample. diff --git a/website/docs/usage/linguistic-features.mdx b/website/docs/usage/linguistic-features.mdx index 21cedd1ef..6ca970407 100644 --- a/website/docs/usage/linguistic-features.mdx +++ b/website/docs/usage/linguistic-features.mdx @@ -1899,7 +1899,7 @@ the two words. "Shore": ("coast", 0.732257), "Precautionary": ("caution", 0.490973), "hopelessness": ("sadness", 0.742366), - "Continous": ("continuous", 0.732549), + "Continuous": ("continuous", 0.732549), "Disemboweled": ("corpse", 0.499432), "biostatistician": ("scientist", 0.339724), "somewheres": ("somewheres", 0.402736), diff --git a/website/docs/usage/projects.mdx b/website/docs/usage/projects.mdx index b089a7ab5..e10ba4c50 100644 --- a/website/docs/usage/projects.mdx +++ b/website/docs/usage/projects.mdx @@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the dependency check, set `check_requirements: false` in your project's `project.yml`. -### 4. Run a workflow {id="run-workfow"} +### 4. Run a workflow {id="run-workflow"} > #### project.yml > @@ -286,7 +286,7 @@ pipelines. | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). | -| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | +| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. 
| | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx index 9a6791d5e..0b0b759e9 100644 --- a/website/docs/usage/saving-loading.mdx +++ b/website/docs/usage/saving-loading.mdx @@ -306,7 +306,9 @@ installed in the same environment – that's it. ### Loading probability tables into existing models -You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`. +You can load a probability table from +[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an +existing spaCy model like `en_core_web_sm`. ```python # Requirements: pip install spacy-lookups-data @@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"]) nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob")) ``` -When training a model from scratch you can also specify probability tables in the `config.cfg`. +When training a model from scratch you can also specify probability tables in +the `config.cfg`. ```ini {title="config.cfg (excerpt)"} [initialize.lookups] @@ -346,8 +349,8 @@ them**! To stick with the theme of [this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/), consider the following custom spaCy -[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a -snake when it's called: +[pipeline component](/usage/processing-pipelines#custom-components) that prints +a snake when it's called: > #### Package directory structure > diff --git a/website/docs/usage/v2-2.mdx b/website/docs/usage/v2-2.mdx index 84129657d..cf4f7c5bf 100644 --- a/website/docs/usage/v2-2.mdx +++ b/website/docs/usage/v2-2.mdx @@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) ✔ Good amount of examples for all labels -✔ Examples without occurences available for all labels +✔ Examples without occurrences available for all labels ✔ No entities consisting of or starting/ending with whitespace =========================== Part-of-speech Tagging =========================== diff --git a/website/docs/usage/v3-2.mdx b/website/docs/usage/v3-2.mdx index b4a4ef672..b3ffd5d68 100644 --- a/website/docs/usage/v3-2.mdx +++ b/website/docs/usage/v3-2.mdx @@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under `TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details in the [transformer API docs](/api/architectures#TransformerModel). -`spacy-transfomers` v1.1 also adds support for `transformer_config` settings +`spacy-transformers` v1.1 also adds support for `transformer_config` settings such as `output_attentions`. Additional output is stored under `TransformerData.model_output`. More details are in the [TransformerModel docs](/api/architectures#TransformerModel). 
The training speed From ecd85d2618dcedb524bc457854ab3cd8e5979f20 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 29 Apr 2024 13:28:46 +0200 Subject: [PATCH 18/20] Update Typer pin and GH actions (#13471) * update gh actions * pin typer upperbound to 1.0.0 --- .github/workflows/explosionbot.yml | 2 +- .github/workflows/slowtests.yml | 2 +- .github/workflows/spacy_universe_alert.yml | 2 +- .github/workflows/tests.yml | 4 ++-- .github/workflows/universe_validation.yml | 2 +- requirements.txt | 2 +- setup.cfg | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index 910cfdc40..78a27cfa3 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -15,7 +15,7 @@ jobs: env: GITHUB_CONTEXT: ${{ toJson(github) }} run: echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 - name: Install and run explosion-bot run: | diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index f9fd3e817..17d8989fa 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: ref: ${{ matrix.branch }} - name: Get commits from past 24 hours diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index 33851fbcc..01731ffe0 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -18,7 +18,7 @@ jobs: run: | echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: '3.10' diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2a236b6bd..af115e817 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -25,7 +25,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Configure Python version uses: actions/setup-python@v4 @@ -74,7 +74,7 @@ jobs: steps: - name: Check out repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Configure Python version uses: actions/setup-python@v4 diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml index 4d492500c..ce7df49db 100644 --- a/.github/workflows/universe_validation.yml +++ b/.github/workflows/universe_validation.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Configure Python version uses: actions/setup-python@v4 diff --git a/requirements.txt b/requirements.txt index 54b8f22a1..2ad92176d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 -typer>=0.3.0,<0.10.0 +typer>=0.3.0,<1.0.0 weasel>=0.1.0,<0.5.0 # Third party dependencies numpy>=1.15.0; python_version < "3.9" diff --git a/setup.cfg b/setup.cfg index 899e808cb..ca8f64548 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,7 +56,7 @@ install_requires = catalogue>=2.0.6,<2.1.0 weasel>=0.1.0,<0.5.0 # Third-party dependencies - typer>=0.3.0,<0.10.0 + typer>=0.3.0,<1.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" From d3a232f773046771adc4cdfaf40343aff5872f4c Mon Sep 17 
00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 30 Apr 2024 09:17:59 +0200 Subject: [PATCH 19/20] Update LICENSE to include 2024 (#13472) --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 979f5ade7..6cb7810c6 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal +Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From c195ca4f9ce98812eb7febc3043e212492ffc07a Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 2 May 2024 16:46:41 +0200 Subject: [PATCH 20/20] fix docs for MorphAnalysis.__contains__ (#13433) --- website/docs/api/morphology.mdx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/website/docs/api/morphology.mdx b/website/docs/api/morphology.mdx index 018ce2524..7f6802034 100644 --- a/website/docs/api/morphology.mdx +++ b/website/docs/api/morphology.mdx @@ -147,9 +147,10 @@ Whether a feature/value pair is in the analysis. > assert "Feat1=Val1" in morph > ``` -| Name | Description | -| ----------- | --------------------------------------------- | -| **RETURNS** | A feature/value pair in the analysis. ~~str~~ | +| Name | Description | +| ------------ | --------------------------------------------------------------------- | +| `feature` | A feature/value pair. ~~str~~ | +| **RETURNS** | Whether the feature/value pair is contained in the analysis. ~~bool~~ | ### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"}
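---

Editor's notes on the patches above (illustrative sketches, not part of the applied diffs):

The `property name:` blocks with nested `__get__`/`__set__` that are removed throughout the `span.pyx`, `token.pyx`, `example.pyx`, and `vocab.pyx` hunks above are legacy Cython-only syntax; the replacement `@property`/`@name.setter` decorators behave the same at runtime and are also valid plain Python. A minimal sketch of the pattern being applied, using a hypothetical stand-in class rather than the real Cython types:

```python
class Annotated:
    """Hypothetical stand-in illustrating the decorator-based pattern."""

    def __init__(self, label_id: int = 0):
        self._label = label_id

    # Old Cython-only form (removed throughout the hunks above):
    #
    # property label:
    #     def __get__(self):
    #         return self._label
    #     def __set__(self, value):
    #         self._label = value

    @property
    def label(self) -> int:
        """Getter body, previously __get__."""
        return self._label

    @label.setter
    def label(self, value: int) -> None:
        """Setter body, previously __set__."""
        self._label = value


obj = Annotated()
obj.label = 42          # goes through the @label.setter
assert obj.label == 42  # goes through the @property getter
```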
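The `EntityLinker` changes in PATCH 14 replace the old pattern of temporarily overwriting and then restoring `doc.ents` inside `update()` with a single `_ensure_ents()` helper that copies each example and sets its predicted entities from the gold alignment; the same helper now also runs before initialization sampling and before scoring, which is why evaluation works even when no NER component precedes the linker. A hedged sketch of the two configurations the new tests exercise (KB setup via `set_kb` is omitted here and would be required before `nlp.initialize()`):

```python
import spacy

# Pipeline without NER: rely on gold entities during training/evaluation.
nlp_gold = spacy.blank("en")
nlp_gold.add_pipe("entity_linker", last=True, config={"use_gold_ents": True})

# Pipeline with NER: the linker consumes the entities NER predicts.
nlp_ner = spacy.blank("en")
nlp_ner.add_pipe("ner", first=True)
nlp_ner.add_pipe("entity_linker", last=True, config={"use_gold_ents": False})
```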
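The corrected `MorphAnalysis.__contains__` table in PATCH 20 documents membership tests on `"Feature=Value"` strings. A small runnable sketch of that behavior, using illustrative feature names:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("test")
# set_morph (see the token.pyx hunks above) accepts a FEATS string
doc[0].set_morph("Feat1=Val1|Feat2=Val2")
morph = doc[0].morph

# __contains__ takes a feature/value pair and returns a bool
assert "Feat1=Val1" in morph
assert "Feat1=Val3" not in morph
```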