From 774c10fa39fb52ee23bd65faafc8eea0ad1f180e Mon Sep 17 00:00:00 2001 From: Simon Gurcke Date: Fri, 27 Jan 2023 20:43:40 +1000 Subject: [PATCH 01/15] Add alignment_mode argument to Span.char_span() (#12145) * Add alignment_mode argument to Span.char_span() * Update website * Update spacy/tokens/span.pyx Co-authored-by: Adriane Boyd * Add test Co-authored-by: Adriane Boyd --- spacy/tests/doc/test_span.py | 8 ++++++++ spacy/tokens/span.pyi | 1 + spacy/tokens/span.pyx | 11 ++++++++--- website/docs/api/span.mdx | 17 +++++++++-------- 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 3676b35af..d02f305f4 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -367,6 +367,14 @@ def test_spans_by_character(doc): span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk" ) + # Span.char_span + alignment mode "contract" + span2 = doc[0:2].char_span( + span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract" + ) + assert span1.start_char == span2.start_char + assert span1.end_char == span2.end_char + assert span2.label_ == "GPE" + def test_span_to_array(doc): span = doc[1:-2] diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 9986a90e6..00226098a 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -98,6 +98,7 @@ class Span: label: Union[int, str] = ..., kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., + alignment_mode: str = ..., ) -> Span: ... @property def conjuncts(self) -> Tuple[Token]: ... diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 99a5f43bd..2912dd705 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -362,7 +362,7 @@ cdef class Span: result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) # ensure we get a scalar back (numpy does this automatically but cupy doesn't) return result.item() - + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. @@ -639,7 +639,7 @@ cdef class Span: else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0): + def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"): """Create a `Span` object from the slice `span.text[start : end]`. start (int): The index of the first character of the span. @@ -649,11 +649,16 @@ cdef class Span: kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + alignment_mode (str): How character indices are aligned to token + boundaries. Options: "strict" (character indices must be aligned + with token boundaries), "contract" (span of all tokens completely + within the character span), "expand" (span of all tokens at least + partially covered by the character span). Defaults to "strict". RETURNS (Span): The newly constructed object. 
""" start_idx += self.c.start_char end_idx += self.c.start_char - return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector) + return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode) @property def conjuncts(self): diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index bd7794edc..a135f5ec9 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -186,14 +186,15 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | -| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +| Name | Description | +| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `alignment_mode` 3.5.1 | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Span.similarity {id="similarity",tag="method",model="vectors"} From 5f8a398bb9d12e65069442de28fe1b9036ff119f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 27 Jan 2023 15:09:17 +0100 Subject: [PATCH 02/15] Add span_id to Span.char_span, update Doc/Span.char_span docs (#12196) * Add span_id to Span.char_span, update Doc/Span.char_span docs `Span.char_span(id=)` should be removed in the future. 
* Also use Union[int, str] in Doc docstring --- spacy/tests/doc/test_span.py | 12 ++++++++++++ spacy/tokens/doc.pyi | 1 + spacy/tokens/doc.pyx | 5 +++-- spacy/tokens/span.pyi | 2 ++ spacy/tokens/span.pyx | 10 ++++++---- website/docs/api/doc.mdx | 19 ++++++++++--------- website/docs/api/span.mdx | 2 ++ 7 files changed, 36 insertions(+), 15 deletions(-) diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index d02f305f4..b4631037a 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text): assert span.text == text +def test_char_span_attributes(doc): + label = "LABEL" + kb_id = "KB_ID" + span_id = "SPAN_ID" + span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id) + span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id) + assert span1.text == span2.text + assert span1.label_ == span2.label_ == label + assert span1.kb_id_ == span2.kb_id_ == kb_id + assert span1.id_ == span2.id_ == span_id + + def test_spans_sent_spans(doc): sents = list(doc.sents) assert sents[0].start == 0 diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index f0cdaee87..9d45960ab 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -108,6 +108,7 @@ class Doc: kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., alignment_mode: str = ..., + span_id: Union[int, str] = ..., ) -> Span: ... def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... @property diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 075bc4d15..7dfe0ca9f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -528,9 +528,9 @@ cdef class Doc: doc (Doc): The parent document. start_idx (int): The index of the first character of the span. end_idx (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for + label (Union[int, str]): A label to attach to the Span, e.g. for named entities. - kb_id (uint64 or string): An ID from a KB to capture the meaning of a + kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. @@ -539,6 +539,7 @@ cdef class Doc: with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". + span_id (Union[int, str]): An identifier to associate with the span. RETURNS (Span): The newly constructed object. DOCS: https://spacy.io/api/doc#char_span diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 00226098a..a92f19e20 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -98,7 +98,9 @@ class Span: label: Union[int, str] = ..., kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., + id: Union[int, str] = ..., alignment_mode: str = ..., + span_id: Union[int, str] = ..., ) -> Span: ... @property def conjuncts(self) -> Tuple[Token]: ... 
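
A minimal usage sketch of the extended `char_span` API follows (illustrative only, not part of the patch itself; the example text, label and span ID values are assumptions, and it presumes a spaCy build that already contains the `alignment_mode` and `span_id` additions shown in these diffs):

```python
# Illustrative sketch only - not part of the patch. Assumes a spaCy build
# that already includes the alignment_mode/span_id additions above.
import spacy

nlp = spacy.blank("en")
doc = nlp("Welcome to New York City")

# Doc.char_span: character offsets 11-20 cover "New York " including the
# trailing space, so "strict" alignment would return None. "contract"
# keeps only the tokens that lie completely inside the character span.
doc_span = doc.char_span(
    11, 20, label="GPE", alignment_mode="contract", span_id="place-1"
)
assert doc_span.text == "New York"
assert doc_span.id_ == "place-1"

# Span.char_span: offsets are relative to the span's own text and are
# forwarded to Doc.char_span together with alignment_mode and span_id.
rest = doc[2:]  # "New York City"
inner = rest.char_span(0, 9, label="GPE", alignment_mode="contract", span_id="place-2")
assert inner.text == "New York"
```
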
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 2912dd705..cfe1236df 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -639,26 +639,28 @@ cdef class Span: else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"): + def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `span.text[start : end]`. start (int): The index of the first character of the span. end (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for + label (Union[int, str]): A label to attach to the Span, e.g. for named entities. - kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. + kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + id (Union[int, str]): Unused. alignment_mode (str): How character indices are aligned to token boundaries. Options: "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". + span_id (Union[int, str]): An identifier to associate with the span. RETURNS (Span): The newly constructed object. """ start_idx += self.c.start_char end_idx += self.c.start_char - return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode) + return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id) @property def conjuncts(self): diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index a5f3de6be..13c59c4af 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -209,15 +209,16 @@ alignment mode `"strict". > assert span.text == "New York" > ``` -| Name | Description | -| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | -| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | -| **RETURNS** | The newly constructed object or `None`. 
~~Optional[Span]~~ | +| Name | Description | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| `span_id` 3.3.1 | An identifier to associate with the span. ~~Union[int, str]~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Doc.set_ents {id="set_ents",tag="method",version="3"} diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index a135f5ec9..41422a5b4 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -193,7 +193,9 @@ the character indices don't map to a valid span. | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `id` | Unused. ~~Union[int, str]~~ | | `alignment_mode` 3.5.1 | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| `span_id` 3.5.1 | An identifier to associate with the span. ~~Union[int, str]~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Span.similarity {id="similarity",tag="method",model="vectors"} From bd739e67d6e730d21a65c616917de24e148b5382 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 27 Jan 2023 15:13:20 +0100 Subject: [PATCH 03/15] explain KB change and how to remedy (#12189) --- website/docs/usage/v3-5.mdx | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/website/docs/usage/v3-5.mdx b/website/docs/usage/v3-5.mdx index ac61338e3..3ca64f8a2 100644 --- a/website/docs/usage/v3-5.mdx +++ b/website/docs/usage/v3-5.mdx @@ -155,6 +155,21 @@ An error is now raised when unsupported values are given as input to train a `textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0` as explained in the [docs](/api/textcategorizer#assigned-attributes). 
+### Using the default knowledge base + +As `KnowledgeBase` is now an abstract class, you should call the constructor of +the new `InMemoryLookupKB` instead when you want to use spaCy's default KB +implementation: + +```diff +- kb = KnowledgeBase() ++ kb = InMemoryLookupKB() +``` + +If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to +implement its abstract methods, or alternatively inherit from `InMemoryLookupKB` +instead. + ### Updated scorers for tokenization and textcat {id="scores"} We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported From 606273f7e47678996cc2d93fe79c5b12f2de1ca5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 27 Jan 2023 16:13:34 +0100 Subject: [PATCH 04/15] Normalize whitespace in evaluate CLI output test (#12157) * Normalize whitespace in evaluate CLI output test Depending on terminal settings, lines may be padded to the screen width so the comparison is too strict with only the command string replacement. * Move to test util method * Change to normalization method --- spacy/tests/test_cli_app.py | 6 +++--- spacy/tests/util.py | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 80da5a447..40100412a 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -4,7 +4,7 @@ from typer.testing import CliRunner from spacy.tokens import DocBin, Doc from spacy.cli._util import app -from .util import make_tempdir +from .util import make_tempdir, normalize_whitespace def test_convert_auto(): @@ -38,8 +38,8 @@ def test_benchmark_accuracy_alias(): # Verify that the `evaluate` alias works correctly. result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"]) result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"]) - assert result_benchmark.stdout == result_evaluate.stdout.replace( - "spacy evaluate", "spacy benchmark accuracy" + assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace( + result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy") ) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index d5f3c39ff..c2647558d 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -1,6 +1,7 @@ import numpy import tempfile import contextlib +import re import srsly from spacy.tokens import Doc from spacy.vocab import Vocab @@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2): for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): assert k1 == k2 assert v1 == v2 + + +def normalize_whitespace(s): + return re.sub(r"\s+", " ", s) From 8932f4dc350ae49b02d6caee5e524e5f48345516 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 30 Jan 2023 18:05:23 +0900 Subject: [PATCH 05/15] Add extra flag to assets docs (#12194) * Add extra flag to assets docs For some reason this wasn't included. * Add new tag to docs --- website/docs/api/cli.mdx | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index f7315bb2c..bd966015e 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1410,12 +1410,13 @@ $ python -m spacy project assets [project_dir] > $ python -m spacy project assets [--sparse] > ``` -| Name | Description | -| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `project_dir` | Path to project directory. 
Defaults to current working directory. ~~Path (positional)~~ | -| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Downloaded or copied assets defined in the `project.yml`. | +| Name | Description | +| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `--extra`, `-e` 3.3.1 | Download assets marked as "extra". Default false. ~~bool (flag)~~ | +| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Downloaded or copied assets defined in the `project.yml`. | ### project run {id="project-run",tag="command"} From 0e51c918ae2fbcaec875367e1d331e4366fdfe64 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 30 Jan 2023 17:51:27 +0100 Subject: [PATCH 06/15] Normalize whitespace in evaluate CLI output test (#12157) * Normalize whitespace in evaluate CLI output test Depending on terminal settings, lines may be padded to the screen width so the comparison is too strict with only the command string replacement. * Move to test util method * Change to normalization method From 1b5aba9e220a6081f006b08929eabc50b0be6c4b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 31 Jan 2023 19:31:17 +0900 Subject: [PATCH 07/15] Don't re-download installed models (#12188) * Don't re-download installed models When downloading a model, this checks if the same version of the same model is already installed. If it is then the download is skipped. This is necessary because pip uses the final download URL for its caching feature, but because of the way models are hosted on Github, their URLs change every few minutes. * Use importlib instead of meta.json * Use get_package_version * Add untested, disabled test --------- Co-authored-by: Adriane Boyd --- .github/azure-steps.yml | 5 +++++ spacy/cli/download.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index d0db75f9a..7c3c3e0a6 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -62,6 +62,11 @@ steps: # - script: | # python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" # displayName: 'Test no warnings on load (#11713)' +# condition: eq(variables['python_version'], '3.8') +# +# - script: | +# python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping +# displayName: 'Test skip re-download (#12188)' # condition: eq(variables['python_version'], '3.8') - script: | diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 4c998a6e0..90471c55e 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -7,7 +7,8 @@ import typer from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX from .. 
import about from ..util import is_package, get_minor_version, run_command -from ..util import is_prerelease_version +from ..util import is_prerelease_version, get_installed_models +from ..util import get_package_version @app.command( @@ -63,6 +64,14 @@ def download( compatibility = get_compatibility() version = get_version(model_name, compatibility) + # If we already have this version installed, skip downloading + installed = get_installed_models() + if model_name in installed: + installed_version = get_package_version(model_name) + if installed_version == version: + msg.warn(f"{model_name} v{version} already installed, skipping") + return + filename = get_model_filename(model_name, version, sdist) download_model(filename, pip_args) From fb7f018ded7ec8bb517fec215d848cc6dff5b0b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 31 Jan 2023 13:06:02 +0100 Subject: [PATCH 08/15] Add the configuration schema for distillation (#12201) * Add the configuration schema for distillation This also adds the default configuration and some tests. The schema will be used by the training loop and `distill` subcommand. * Format * Change distillation shortopt to -d * Fix descripion of max_epochs * Rename distillation flag to -dt * Rename `pipe_map` to `student_to_teacher` --- spacy/cli/init_config.py | 15 ++++- spacy/default_config_distillation.cfg | 34 ++++++++++ spacy/language.py | 3 + spacy/schemas.py | 23 +++++++ .../tests/serialize/test_serialize_config.py | 65 ++++++++++++++++++- 5 files changed, 137 insertions(+), 3 deletions(-) create mode 100644 spacy/default_config_distillation.cfg diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index b634caa4c..fbfb56aa2 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -8,7 +8,7 @@ import re from jinja2 import Template from .. 
import util -from ..language import DEFAULT_CONFIG_PRETRAIN_PATH +from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH from ..schemas import RecommendationSchema from ..util import SimpleFrozenList from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND @@ -83,6 +83,7 @@ def init_fill_config_cli( # fmt: off base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False), output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True), + distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"), pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"), code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), @@ -98,13 +99,20 @@ def init_fill_config_cli( DOCS: https://spacy.io/api/cli#init-fill-config """ import_code(code_path) - fill_config(output_file, base_path, pretraining=pretraining, diff=diff) + fill_config( + output_file, + base_path, + distillation=distillation, + pretraining=pretraining, + diff=diff, + ) def fill_config( output_file: Path, base_path: Path, *, + distillation: bool = False, pretraining: bool = False, diff: bool = False, silent: bool = False, @@ -123,6 +131,9 @@ def fill_config( # replaced with their actual config after loading, so we have to re-add them sourced = util.get_sourced_components(config) filled["components"].update(sourced) + if distillation: + distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH) + filled = distillation_config.merge(filled) if pretraining: validate_config_for_pretrain(filled, msg) pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) diff --git a/spacy/default_config_distillation.cfg b/spacy/default_config_distillation.cfg new file mode 100644 index 000000000..1926fafa9 --- /dev/null +++ b/spacy/default_config_distillation.cfg @@ -0,0 +1,34 @@ +[paths] +raw_text = null + +[distillation] +corpus = "corpora.distillation" +dropout = 0.1 +max_epochs = 1 +max_steps = 0 +student_to_teacher = {} + +[distillation.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 + +[distillation.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 1e-4 + +[corpora] + +[corpora.distillation] +@readers = "spacy.PlainTextCorpus.v1" +path = ${paths.raw_text} +min_length = 0 +max_length = 0 diff --git a/spacy/language.py b/spacy/language.py index 2e3c6d2a2..7b657515a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -48,6 +48,9 @@ PipeCallable = Callable[[Doc], Doc] # This is the base config will all settings (training etc.) 
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg" DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH) +# This is the base config for the [distillation] block and currently not included +# in the main config and only added via the 'init fill-config' command +DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg" # This is the base config for the [pretraining] block and currently not included # in the main config and only added via the 'init fill-config' command DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg" diff --git a/spacy/schemas.py b/spacy/schemas.py index c8467fea8..975affc67 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -422,6 +422,27 @@ class ConfigSchemaInit(BaseModel): arbitrary_types_allowed = True +class ConfigSchemaDistillEmpty(BaseModel): + class Config: + extra = "forbid" + + +class ConfigSchemaDistill(BaseModel): + # fmt: off + batcher: Batcher = Field(..., title="Batcher for the training data") + corpus: StrictStr = Field(..., title="Path in the config to the distillation data") + dropout: StrictFloat = Field(..., title="Dropout rate") + max_epochs: StrictInt = Field(..., title="Maximum number of epochs to distill for") + max_steps: StrictInt = Field(..., title="Maximum number of steps to distill for") + optimizer: Optimizer = Field(..., title="The optimizer to use") + student_to_teacher: Dict[str, str] = Field(..., title="Mapping from student to teacher pipe") + # fmt: on + + class Config: + extra = "forbid" + arbitrary_types_allowed = True + + class ConfigSchema(BaseModel): training: ConfigSchemaTraining nlp: ConfigSchemaNlp @@ -429,6 +450,7 @@ class ConfigSchema(BaseModel): components: Dict[str, Dict[str, Any]] corpora: Dict[str, Reader] initialize: ConfigSchemaInit + distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {} # type: ignore[assignment] class Config: extra = "allow" @@ -440,6 +462,7 @@ CONFIG_SCHEMAS = { "training": ConfigSchemaTraining, "pretraining": ConfigSchemaPretrain, "initialize": ConfigSchemaInit, + "distill": ConfigSchemaDistill, } diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 82f01dcc2..6eb95001a 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -6,10 +6,11 @@ import spacy from spacy.lang.de import German from spacy.lang.en import English from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH +from spacy.language import DEFAULT_CONFIG_DISTILL_PATH from spacy.language import Language from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model -from spacy.schemas import ConfigSchema, ConfigSchemaPretrain +from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain from spacy.util import load_config, load_config_from_str from spacy.util import load_model_from_config, registry @@ -66,6 +67,60 @@ factory = "tagger" width = ${components.tok2vec.model.width} """ +distill_config_string = """ +[paths] +train = null +dev = null + +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} + +[training] + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 666 + +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + 
+[components.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 342 +depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v2" + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.width} + +[distill] +""" + + pretrain_config_string = """ [paths] train = null @@ -201,6 +256,14 @@ def test_create_nlp_from_config(): load_model_from_config(Config(bad_cfg), auto_fill=True) +def test_nlp_from_distillation_config(): + """Test that the default distillation config validates properly""" + config = Config().from_str(distill_config_string) + distill_config = load_config(DEFAULT_CONFIG_DISTILL_PATH) + filled = config.merge(distill_config) + registry.resolve(filled["distillation"], schema=ConfigSchemaDistill) + + def test_create_nlp_from_pretraining_config(): """Test that the default pretraining config validates properly""" config = Config().from_str(pretrain_config_string) From c6cca4c00a31651962361b1504e1c50124ad2e8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 31 Jan 2023 13:19:42 +0100 Subject: [PATCH 09/15] Language.distill: copy both reference and predicted (#12209) * Language.distill: copy both reference and predicted In distillation we also modify the teacher docs (e.g. in tok2vec components), so we need to copy both the reference and predicted doc. Problem caught by @shadeMe * Make new `_copy_examples` args kwonly --- spacy/language.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 7b657515a..d2b89029d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1062,7 +1062,7 @@ class Language: return losses validate_distillation_examples(examples, "Language.distill") - examples = _copy_examples(examples) + examples = _copy_examples(examples, copy_x=True, copy_y=True) if sgd is None: if self._optimizer is None: @@ -2331,13 +2331,18 @@ class DisabledPipes(list): self[:] = [] -def _copy_examples(examples: Iterable[Example]) -> List[Example]: +def _copy_examples( + examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False +) -> List[Example]: """Make a copy of a batch of examples, copying the predicted Doc as well. This is used in contexts where we need to take ownership of the examples so that they can be mutated, for instance during Language.evaluate and Language.update. """ - return [Example(eg.x.copy(), eg.y) for eg in examples] + return [ + Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y) + for eg in examples + ] def _apply_pipes( From 02af17a5c8861e4fdc9790aa197e40b7b428e7b4 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 31 Jan 2023 16:52:06 +0100 Subject: [PATCH 10/15] Remove flaky assertions. 
(#12210) --- spacy/tests/test_cli.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 42ffae22d..dc7ce46fe 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1017,8 +1017,6 @@ def test_local_remote_storage_pull_missing(): def test_cli_find_threshold(capsys): - thresholds = numpy.linspace(0, 1, 10) - def make_examples(nlp: Language) -> List[Example]: docs: List[Example] = [] @@ -1082,8 +1080,6 @@ def test_cli_find_threshold(capsys): scores_key="cats_macro_f", silent=True, ) - assert best_threshold != thresholds[0] - assert thresholds[0] < best_threshold < thresholds[9] assert best_score == max(res.values()) assert res[1.0] == 0.0 @@ -1091,7 +1087,7 @@ def test_cli_find_threshold(capsys): nlp, _ = init_nlp((("spancat", {}),)) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) - res = find_threshold( + best_threshold, best_score, res = find_threshold( model=nlp_dir, data_path=docs_dir / "docs.spacy", pipe_name="spancat", @@ -1099,10 +1095,8 @@ def test_cli_find_threshold(capsys): scores_key="spans_sc_f", silent=True, ) - assert res[0] != thresholds[0] - assert thresholds[0] < res[0] < thresholds[8] - assert res[1] >= 0.6 - assert res[2][1.0] == 0.0 + assert best_score == max(res.values()) + assert res[1.0] == 0.0 # Having multiple textcat_multilabel components should work, since the name has to be specified. nlp, _ = init_nlp((("textcat_multilabel", {}),)) From 360ccf628ab9ad97bbdadd050d760ac6493332ed Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Tue, 31 Jan 2023 17:30:43 +0100 Subject: [PATCH 11/15] Rename language codes (Icelandic, multi-language) (#12149) * Init * fix tests * Update spacy/errors.py Co-authored-by: Adriane Boyd * Fix test_blank_languages * Rename xx to mul in docs * Format _util with black * prettier formatting --------- Co-authored-by: Adriane Boyd --- spacy/cli/_util.py | 11 ++++ spacy/cli/convert.py | 6 ++- spacy/cli/init_config.py | 8 ++- spacy/cli/init_pipeline.py | 6 ++- spacy/errors.py | 1 + spacy/lang/{is => isl}/__init__.py | 2 +- spacy/lang/{is => isl}/stop_words.py | 0 spacy/lang/{xx => mul}/__init__.py | 4 +- spacy/lang/{xx => mul}/examples.py | 0 spacy/scorer.py | 2 +- spacy/tests/README.md | 2 +- spacy/tests/conftest.py | 10 ++-- spacy/tests/doc/test_doc_api.py | 2 +- spacy/tests/lang/{is => isl}/__init__.py | 0 spacy/tests/lang/{is => isl}/test_text.py | 8 +-- .../tests/lang/{is => isl}/test_tokenizer.py | 8 +-- spacy/tests/lang/{xx => mul}/__init__.py | 0 spacy/tests/lang/{xx => mul}/test_text.py | 4 +- .../tests/lang/{xx => mul}/test_tokenizer.py | 8 +-- spacy/tests/lang/test_initialize.py | 6 +-- spacy/tests/pipeline/test_span_ruler.py | 52 +++++++++---------- spacy/tests/test_language.py | 9 ++-- spacy/tests/tokenizer/test_explain.py | 1 + .../training/converters/conll_ner_to_docs.py | 4 +- spacy/training/converters/json_to_docs.py | 2 +- spacy/util.py | 8 +-- website/docs/api/scorer.mdx | 2 +- website/docs/usage/models.mdx | 12 ++--- website/meta/languages.json | 6 +-- website/src/widgets/quickstart-models.js | 2 +- 30 files changed, 104 insertions(+), 82 deletions(-) rename spacy/lang/{is => isl}/__init__.py (93%) rename spacy/lang/{is => isl}/stop_words.py (100%) rename spacy/lang/{xx => mul}/__init__.py (67%) rename spacy/lang/{xx => mul}/examples.py (100%) rename spacy/tests/lang/{is => isl}/__init__.py (100%) rename spacy/tests/lang/{is => isl}/test_text.py (85%) rename spacy/tests/lang/{is => 
isl}/test_tokenizer.py (72%) rename spacy/tests/lang/{xx => mul}/__init__.py (100%) rename spacy/tests/lang/{xx => mul}/test_text.py (96%) rename spacy/tests/lang/{xx => mul}/test_tokenizer.py (68%) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index eb4869666..42883f896 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -19,6 +19,7 @@ import os from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS +from ..errors import RENAMED_LANGUAGE_CODES from .. import about if TYPE_CHECKING: @@ -134,6 +135,16 @@ def _parse_override(value: Any) -> Any: return str(value) +def _handle_renamed_language_codes(lang: Optional[str]) -> None: + # Throw error for renamed language codes in v4 + if lang in RENAMED_LANGUAGE_CODES: + msg.fail( + title="Renamed language code", + text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.", + exits=1, + ) + + def load_project_config( path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() ) -> Dict[str, Any]: diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 68d454b3e..66f9461a9 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,7 +7,7 @@ import re import sys import itertools -from ._util import app, Arg, Opt, walk_directory +from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory from ..training import docs_to_json from ..tokens import Doc, DocBin from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs @@ -112,6 +112,10 @@ def convert( input_path = Path(input_path) if not msg: msg = Printer(no_print=silent) + + # Throw error for renamed language codes in v4 + _handle_renamed_language_codes(lang) + ner_map = srsly.read_json(ner_map) if ner_map is not None else None doc_files = [] for input_loc in walk_directory(input_path, converter): diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index fbfb56aa2..40e598e5f 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -12,7 +12,7 @@ from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH from ..schemas import RecommendationSchema from ..util import SimpleFrozenList from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND -from ._util import string_to_list, import_code +from ._util import string_to_list, import_code, _handle_renamed_language_codes ROOT = Path(__file__).parent / "templates" @@ -43,7 +43,7 @@ class InitValues: def init_config_cli( # fmt: off output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), - lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"), + lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"), pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). 
This will impact the choice of architecture, pretrained weights and related hyperparameters."), gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."), @@ -169,6 +169,10 @@ def init_config( msg = Printer(no_print=silent) with TEMPLATE_PATH.open("r") as f: template = Template(f.read()) + + # Throw error for renamed language codes in v4 + _handle_renamed_language_codes(lang) + # Filter out duplicates since tok2vec and transformer are added by template pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] defaults = RECOMMENDATIONS["__default__"] diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index d53a61b8e..f279cf793 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -9,7 +9,7 @@ from .. import util from ..training.initialize import init_nlp, convert_vectors from ..language import Language from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, setup_gpu +from ._util import import_code, setup_gpu, _handle_renamed_language_codes @init_cli.command("vectors") @@ -31,6 +31,10 @@ def init_vectors_cli( a model with vectors. """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + + # Throw error for renamed language codes in v4 + _handle_renamed_language_codes(lang) + msg.info(f"Creating blank nlp object for language '{lang}'") nlp = util.get_lang_class(lang)() if jsonl_loc is not None: diff --git a/spacy/errors.py b/spacy/errors.py index 5f480c16c..eadbf63d6 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -962,6 +962,7 @@ class Errors(metaclass=ErrorsWithCodes): "reference and predicted docs.") E4004 = ("Backprop is not supported when is_train is not set.") +RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} # fmt: on diff --git a/spacy/lang/is/__init__.py b/spacy/lang/isl/__init__.py similarity index 93% rename from spacy/lang/is/__init__.py rename to spacy/lang/isl/__init__.py index 318363beb..16d1f7957 100644 --- a/spacy/lang/is/__init__.py +++ b/spacy/lang/isl/__init__.py @@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults): class Icelandic(Language): - lang = "is" + lang = "isl" Defaults = IcelandicDefaults diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/isl/stop_words.py similarity index 100% rename from spacy/lang/is/stop_words.py rename to spacy/lang/isl/stop_words.py diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/mul/__init__.py similarity index 67% rename from spacy/lang/xx/__init__.py rename to spacy/lang/mul/__init__.py index aff8403ff..5170f1e86 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/mul/__init__.py @@ -3,10 +3,10 @@ from ...language import Language class MultiLanguage(Language): """Language class to be used for models that support multiple languages. - This module allows models to specify their language ID as 'xx'. + This module allows models to specify their language ID as 'mul'. 
""" - lang = "xx" + lang = "mul" __all__ = ["MultiLanguage"] diff --git a/spacy/lang/xx/examples.py b/spacy/lang/mul/examples.py similarity index 100% rename from spacy/lang/xx/examples.py rename to spacy/lang/mul/examples.py diff --git a/spacy/scorer.py b/spacy/scorer.py index de4f52be6..095effdcf 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -104,7 +104,7 @@ class Scorer: def __init__( self, nlp: Optional["Language"] = None, - default_lang: str = "xx", + default_lang: str = "mul", default_pipeline: Iterable[str] = DEFAULT_PIPELINE, **cfg, ) -> None: diff --git a/spacy/tests/README.md b/spacy/tests/README.md index f3c96a39e..9ac1e6d2e 100644 --- a/spacy/tests/README.md +++ b/spacy/tests/README.md @@ -86,7 +86,7 @@ These are the main fixtures that are currently available: | Fixture | Description | | ----------------------------------- | ---------------------------------------------------------------------------- | -| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. | +| `tokenizer` | Basic, language-independent tokenizer. Identical to the `mul` language class. | | `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. | | `en_vocab` | Creates an instance of the English `Vocab`. | diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b9c4ef715..cc0450cab 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -83,7 +83,7 @@ def register_cython_tests(cython_mod_name: str, test_mod_name: str): @pytest.fixture(scope="module") def tokenizer(): - return get_lang_class("xx")().tokenizer + return get_lang_class("mul")().tokenizer @pytest.fixture(scope="session") @@ -243,8 +243,8 @@ def id_tokenizer(): @pytest.fixture(scope="session") -def is_tokenizer(): - return get_lang_class("is")().tokenizer +def isl_tokenizer(): + return get_lang_class("isl")().tokenizer @pytest.fixture(scope="session") @@ -496,8 +496,8 @@ def vi_tokenizer(): @pytest.fixture(scope="session") -def xx_tokenizer(): - return get_lang_class("xx")().tokenizer +def mul_tokenizer(): + return get_lang_class("mul")().tokenizer @pytest.fixture(scope="session") diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index f77d54493..2009a29d6 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -9,7 +9,7 @@ from thinc.api import NumpyOps, get_current_ops from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS from spacy.attrs import SENT_START, TAG from spacy.lang.en import English -from spacy.lang.xx import MultiLanguage +from spacy.lang.mul import MultiLanguage from spacy.language import Language from spacy.lexeme import Lexeme from spacy.tokens import Doc, Span, SpanGroup, Token diff --git a/spacy/tests/lang/is/__init__.py b/spacy/tests/lang/isl/__init__.py similarity index 100% rename from spacy/tests/lang/is/__init__.py rename to spacy/tests/lang/isl/__init__.py diff --git a/spacy/tests/lang/is/test_text.py b/spacy/tests/lang/isl/test_text.py similarity index 85% rename from spacy/tests/lang/is/test_text.py rename to spacy/tests/lang/isl/test_text.py index 6e3654a6e..9e177485d 100644 --- a/spacy/tests/lang/is/test_text.py +++ b/spacy/tests/lang/isl/test_text.py @@ -1,7 +1,7 @@ import pytest -def test_long_text(is_tokenizer): +def test_long_text(isl_tokenizer): # Excerpt: European Convention on Human Rights text = """ hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja @@ -15,12 +15,12 @@ réttlætis og friðar í heiminum og best er tryggt, annars 
vegar með virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins; """ - tokens = is_tokenizer(text) + tokens = isl_tokenizer(text) assert len(tokens) == 120 @pytest.mark.xfail -def test_ordinal_number(is_tokenizer): +def test_ordinal_number(isl_tokenizer): text = "10. desember 1948" - tokens = is_tokenizer(text) + tokens = isl_tokenizer(text) assert len(tokens) == 3 diff --git a/spacy/tests/lang/is/test_tokenizer.py b/spacy/tests/lang/isl/test_tokenizer.py similarity index 72% rename from spacy/tests/lang/is/test_tokenizer.py rename to spacy/tests/lang/isl/test_tokenizer.py index 0c05a6050..ba534aaf6 100644 --- a/spacy/tests/lang/is/test_tokenizer.py +++ b/spacy/tests/lang/isl/test_tokenizer.py @@ -1,6 +1,6 @@ import pytest -IS_BASIC_TOKENIZATION_TESTS = [ +ISL_BASIC_TOKENIZATION_TESTS = [ ( "Enginn maður skal sæta pyndingum eða ómannlegri eða " "vanvirðandi meðferð eða refsingu. ", @@ -23,8 +23,8 @@ IS_BASIC_TOKENIZATION_TESTS = [ ] -@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS) -def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens): - tokens = is_tokenizer(text) +@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS) +def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens): + tokens = isl_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list diff --git a/spacy/tests/lang/xx/__init__.py b/spacy/tests/lang/mul/__init__.py similarity index 100% rename from spacy/tests/lang/xx/__init__.py rename to spacy/tests/lang/mul/__init__.py diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/mul/test_text.py similarity index 96% rename from spacy/tests/lang/xx/test_text.py rename to spacy/tests/lang/mul/test_text.py index 477f0ebe2..6e4262d66 100644 --- a/spacy/tests/lang/xx/test_text.py +++ b/spacy/tests/lang/mul/test_text.py @@ -1,7 +1,7 @@ import pytest -def test_long_text(xx_tokenizer): +def test_long_text(mul_tokenizer): # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi text = """ Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest. @@ -20,5 +20,5 @@ vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuu Sääʹmteʹǧǧ. """ - tokens = xx_tokenizer(text) + tokens = mul_tokenizer(text) assert len(tokens) == 179 diff --git a/spacy/tests/lang/xx/test_tokenizer.py b/spacy/tests/lang/mul/test_tokenizer.py similarity index 68% rename from spacy/tests/lang/xx/test_tokenizer.py rename to spacy/tests/lang/mul/test_tokenizer.py index 15c760a6b..3d06dc11c 100644 --- a/spacy/tests/lang/xx/test_tokenizer.py +++ b/spacy/tests/lang/mul/test_tokenizer.py @@ -1,6 +1,6 @@ import pytest -XX_BASIC_TOKENIZATION_TESTS = [ +MUL_BASIC_TOKENIZATION_TESTS = [ ( "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. 
Seeʹst pâʹjjel", [ @@ -18,8 +18,8 @@ XX_BASIC_TOKENIZATION_TESTS = [ ] -@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS) -def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens): - tokens = xx_tokenizer(text) +@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS) +def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens): + tokens = mul_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 36f4a75e0..98d37f832 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -7,10 +7,10 @@ from spacy.util import get_lang_class # excluded: ja, ko, th, vi, zh LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi", - "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv", - "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", + "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv", + "mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn", - "tr", "tt", "uk", "ur", "xx", "yo"] + "tr", "tt", "uk", "ur", "yo"] # fmt: on diff --git a/spacy/tests/pipeline/test_span_ruler.py b/spacy/tests/pipeline/test_span_ruler.py index 794815359..fe3bdd1bf 100644 --- a/spacy/tests/pipeline/test_span_ruler.py +++ b/spacy/tests/pipeline/test_span_ruler.py @@ -47,7 +47,7 @@ def person_org_date_patterns(person_org_patterns): def test_span_ruler_add_empty(patterns): """Test that patterns don't get added excessively.""" - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"validate": True}) ruler.add_patterns(patterns) pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) @@ -58,7 +58,7 @@ def test_span_ruler_add_empty(patterns): def test_span_ruler_init(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler) == len(patterns) @@ -74,7 +74,7 @@ def test_span_ruler_init(patterns): def test_span_ruler_no_patterns_warns(): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") assert len(ruler) == 0 assert len(ruler.labels) == 0 @@ -86,7 +86,7 @@ def test_span_ruler_no_patterns_warns(): def test_span_ruler_init_patterns(patterns): # initialize with patterns - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") assert len(ruler.labels) == 0 ruler.initialize(lambda: [], patterns=patterns) @@ -110,7 +110,7 @@ def test_span_ruler_init_patterns(patterns): def test_span_ruler_init_clear(patterns): """Test that initialization clears patterns.""" - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler.labels) == 4 @@ -119,7 +119,7 @@ def test_span_ruler_init_clear(patterns): def test_span_ruler_clear(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler.labels) == 4 @@ -133,7 +133,7 @@ def test_span_ruler_clear(patterns): def test_span_ruler_existing(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"overwrite": False}) 
ruler.add_patterns(patterns) doc = nlp.make_doc("OH HELLO WORLD bye bye") @@ -148,7 +148,7 @@ def test_span_ruler_existing(patterns): def test_span_ruler_existing_overwrite(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"overwrite": True}) ruler.add_patterns(patterns) doc = nlp.make_doc("OH HELLO WORLD bye bye") @@ -161,13 +161,13 @@ def test_span_ruler_existing_overwrite(patterns): def test_span_ruler_serialize_bytes(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler) == len(patterns) assert len(ruler.labels) == 4 ruler_bytes = ruler.to_bytes() - new_nlp = spacy.blank("xx") + new_nlp = spacy.blank("mul") new_ruler = new_nlp.add_pipe("span_ruler") assert len(new_ruler) == 0 assert len(new_ruler.labels) == 0 @@ -181,7 +181,7 @@ def test_span_ruler_serialize_bytes(patterns): def test_span_ruler_validate(): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") validated_ruler = nlp.add_pipe( "span_ruler", name="validated_span_ruler", config={"validate": True} @@ -203,14 +203,14 @@ def test_span_ruler_validate(): def test_span_ruler_properties(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"overwrite": True}) ruler.add_patterns(patterns) assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns])) def test_span_ruler_overlapping_spans(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(overlapping_patterns) doc = ruler(nlp.make_doc("foo bar baz")) @@ -220,7 +220,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns): def test_span_ruler_scorer(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(overlapping_patterns) text = "foo bar baz" @@ -243,7 +243,7 @@ def test_span_ruler_multiprocessing(n_process): patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}] - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) @@ -253,7 +253,7 @@ def test_span_ruler_multiprocessing(n_process): def test_span_ruler_serialize_dir(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) with make_tempdir() as d: @@ -264,7 +264,7 @@ def test_span_ruler_serialize_dir(patterns): def test_span_ruler_remove_basic(person_org_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_patterns) doc = ruler(nlp.make_doc("Dina went to school")) @@ -279,7 +279,7 @@ def test_span_ruler_remove_basic(person_org_patterns): def test_span_ruler_remove_nonexisting_pattern(person_org_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_patterns) assert len(ruler.patterns) == 3 @@ -290,7 +290,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns): def test_span_ruler_remove_several_patterns(person_org_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_patterns) doc = ruler(nlp.make_doc("Dina founded the company ACME.")) @@ -314,7 +314,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns): def 
test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_date_patterns) doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th")) @@ -332,7 +332,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns): def test_span_ruler_remove_all_patterns(person_org_date_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_date_patterns) assert len(ruler.patterns) == 4 @@ -348,7 +348,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns): def test_span_ruler_remove_and_add(): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") patterns1 = [{"label": "DATE1", "pattern": "last time"}] ruler.add_patterns(patterns1) @@ -404,7 +404,7 @@ def test_span_ruler_remove_and_add(): def test_span_ruler_spans_filter(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe( "span_ruler", config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}}, @@ -416,7 +416,7 @@ def test_span_ruler_spans_filter(overlapping_patterns): def test_span_ruler_ents_default_filter(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True}) ruler.add_patterns(overlapping_patterns) doc = ruler(nlp.make_doc("foo bar baz")) @@ -425,7 +425,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns): def test_span_ruler_ents_overwrite_filter(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe( "span_ruler", config={ @@ -452,7 +452,7 @@ def test_span_ruler_ents_bad_filter(overlapping_patterns): return pass_through_filter - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe( "span_ruler", config={ diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 89fa08ec7..f2d6d5fc0 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -664,11 +664,12 @@ def test_spacy_blank(): ("fra", "fr"), ("fre", "fr"), ("iw", "he"), + ("is", "isl"), ("mo", "ro"), - ("mul", "xx"), + ("mul", "mul"), ("no", "nb"), ("pt-BR", "pt"), - ("xx", "xx"), + ("xx", "mul"), ("zh-Hans", "zh"), ("zh-Hant", None), ("zxx", None), @@ -689,11 +690,11 @@ def test_language_matching(lang, target): ("fra", "fr"), ("fre", "fr"), ("iw", "he"), + ("is", "isl"), ("mo", "ro"), - ("mul", "xx"), + ("xx", "mul"), ("no", "nb"), ("pt-BR", "pt"), - ("xx", "xx"), ("zh-Hans", "zh"), ], ) diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index 5b4eeca16..4268392dd 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -36,6 +36,7 @@ LANGUAGES = [ "hu", pytest.param("id", marks=pytest.mark.slow()), pytest.param("it", marks=pytest.mark.slow()), + pytest.param("isl", marks=pytest.mark.slow()), pytest.param("kn", marks=pytest.mark.slow()), pytest.param("lb", marks=pytest.mark.slow()), pytest.param("lt", marks=pytest.mark.slow()), diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py index 28b21c5f0..259f5fa8c 100644 --- a/spacy/training/converters/conll_ner_to_docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -86,7 +86,7 @@ def conll_ner_to_docs( if model: nlp = load_model(model) else: - nlp = get_lang_class("xx")() + 
nlp = get_lang_class("mul")() for conll_doc in input_data.strip().split(doc_delimiter): conll_doc = conll_doc.strip() if not conll_doc: @@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None): "Segmenting sentences with sentencizer. (Use `-b model` for " "improved parser-based sentence segmentation.)" ) - nlp = get_lang_class("xx")() + nlp = get_lang_class("mul")() sentencizer = nlp.create_pipe("sentencizer") lines = doc.strip().split("\n") words = [line.strip().split()[0] for line in lines] diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py index 4123839f2..1ff7a64e0 100644 --- a/spacy/training/converters/json_to_docs.py +++ b/spacy/training/converters/json_to_docs.py @@ -3,7 +3,7 @@ from ..gold_io import json_iterate, json_to_annotations from ..example import annotations_to_doc from ..example import _fix_legacy_dict_data, _parse_example_dict_data from ...util import load_model -from ...lang.xx import MultiLanguage +from ...lang.mul import MultiLanguage def json_to_docs(input_data, model=None, **kwargs): diff --git a/spacy/util.py b/spacy/util.py index 3956d27d7..e2ca0e6a4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -283,7 +283,7 @@ def find_matching_language(lang: str) -> Optional[str]: import spacy.lang # noqa: F401 if lang == "xx": - return "xx" + return "mul" # Find out which language modules we have possible_languages = [] @@ -301,11 +301,7 @@ def find_matching_language(lang: str) -> Optional[str]: # is labeled that way is probably trying to be distinct from 'zh' and # shouldn't automatically match. match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9) - if match == "mul": - # Convert 'mul' back to spaCy's 'xx' - return "xx" - else: - return match + return match def get_lang_class(lang: str) -> Type["Language"]: diff --git a/website/docs/api/scorer.mdx b/website/docs/api/scorer.mdx index 6f0c95f6f..d72018b90 100644 --- a/website/docs/api/scorer.mdx +++ b/website/docs/api/scorer.mdx @@ -30,7 +30,7 @@ Create a new `Scorer`. | Name | Description | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ | -| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ | +| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `mul`. ~~str~~ | | `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ | | _keyword-only_ | | | `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx index 765805dc2..5b783002c 100644 --- a/website/docs/usage/models.mdx +++ b/website/docs/usage/models.mdx @@ -74,23 +74,23 @@ your data. 
> ```python > # Standard import -> from spacy.lang.xx import MultiLanguage +> from spacy.lang.mul import MultiLanguage > nlp = MultiLanguage() > > # With lazy-loading -> nlp = spacy.blank("xx") +> nlp = spacy.blank("mul") > ``` spaCy also supports pipelines trained on more than one language. This is especially useful for named entity recognition. The language ID used for -multi-language or language-neutral pipelines is `xx`. The language class, a +multi-language or language-neutral pipelines is `mul`. The language class, a generic subclass containing only the base language data, can be found in -[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx). +[`lang/mul`](%%GITHUB_SPACY/spacy/lang/mul). To train a pipeline using the neutral multi-language class, you can set -`lang = "xx"` in your [training config](/usage/training#config). You can also +`lang = "mul"` in your [training config](/usage/training#config). You can also \import the `MultiLanguage` class directly, or call -[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading. +[`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading. ### Chinese language support {id="chinese",version="2.3"} diff --git a/website/meta/languages.json b/website/meta/languages.json index 46c0d3adb..eeb3a74b7 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -165,7 +165,7 @@ "has_examples": true }, { - "code": "is", + "code": "isl", "name": "Icelandic" }, { @@ -434,9 +434,9 @@ ] }, { - "code": "xx", + "code": "mul", "name": "Multi-language", - "models": ["xx_ent_wiki_sm", "xx_sent_ud_sm"], + "models": ["mul_ent_wiki_sm", "mul_sent_ud_sm"], "example": "This is a sentence about Facebook." }, { diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js index b2a0a6280..4994dc226 100644 --- a/website/src/widgets/quickstart-models.js +++ b/website/src/widgets/quickstart-models.js @@ -103,7 +103,7 @@ const QuickstartInstall = ({ id, title, description, children }) => { print([ - {code === 'xx' + {code === 'mul' ? '(ent.text, ent.label) for ent in doc.ents' : '(w.text, w.pos_) for w in doc'} ]) From 6920fb7baf6e5e28a5cf96b3babbae55466056ee Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 1 Feb 2023 17:47:56 +0900 Subject: [PATCH 12/15] Move Entity Linker v1 to spacy-legacy (#12006) * Move Entity Linker v1 component to spacy-legacy This is a follow up to #11889 that moves the component instead of removing it. In general, we never import from spacy-legacy in spaCy proper. However, to use this component, that kind of import will be necessary. I was able to test this without issues, but is this current import strategy acceptable? Or should we put the component in a registry? * Use spacy-legacy pr for CI This will need to be reverted before merging. * Add temporary step to log installed spacy-legacy version * Modify requirements.txt to trigger tests * Add comment to Python to trigger tests * TODO REVERT This is a commit with logic changes to trigger tests * Remove pipe from YAML Works locally, but possibly this is causing a quoting error or something. * Revert "TODO REVERT This is a commit with logic changes to trigger tests" This reverts commit 689fae71f31de4f54a00dd7dae0c26b19563c027. * Revert "Add comment to Python to trigger tests" This reverts commit 11840fc59886658c59aeb186a20173f5ec7c4583. 
* Add more logging * Try installing directly in workflow * Try explicitly uninstalling spacy-legacy first * Cat requirements.txt to confirm contents In the branch, the thinc version spec is `thinc>=8.1.0,<8.2.0`. But in the logs, it's clear that a development release of 9.0 is being installed. It's not clear why that would happen. * Log requirements at start of build * TODO REVERT Change thinc spec Want to see what happens to the installed thinc spec with this change. * Update thinc requirements This makes it the same as it was before the merge, >=8.1.0,<8.2.0. * Use same thinc version as v4 branch * TODO REVERT Mark dependency check as xfail spacy-legacy is specified as a git checkout in requirements.txt while this PR is in progress, which makes the consistency check here fail. * Remove debugging output / install step * Revert "Remove debugging output / install step" This reverts commit 923ea7448b5e819d73272bc4e43e8880a8598a07. * Clean up debugging output The manual install step with the URL fragment seems to have caused issues on Windows due to the = in the URL being misinterpreted. On the other hand, removing it seems to mean the git version of spacy-legacy isn't actually installed. This PR removes the URL fragment but keeps the direct command-line install. Additionally, since it looks like this job is configured to use the default shell (and not bash), it removes a comment that upsets the Windows cmd shell. * Revert "TODO REVERT Mark dependency check as xfail" This reverts commit d4863ec1563b7819c31a865cb94262b7dc592b7e. * Fix requirements.txt, increasing spacy-legacy version * Raise spacy legacy version in setup.cfg * Remove azure build workarounds * make spacy-legacy version explicit in error message * Remove debugging line * Suggestions from code review --- spacy/pipeline/entity_linker.py | 7 +- spacy/pipeline/legacy/__init__.py | 3 - spacy/pipeline/legacy/entity_linker.py | 422 --------------------- spacy/tests/pipeline/test_entity_linker.py | 3 +- 4 files changed, 8 insertions(+), 427 deletions(-) delete mode 100644 spacy/pipeline/legacy/__init__.py delete mode 100644 spacy/pipeline/legacy/entity_linker.py diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index fa4dea75a..6fe322b62 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -13,7 +13,6 @@ from ..kb import KnowledgeBase, Candidate from ..ml import empty_kb from ..tokens import Doc, Span from .pipe import deserialize_config -from .legacy.entity_linker import EntityLinker_v1 from .trainable_pipe import TrainablePipe from ..language import Language from ..vocab import Vocab @@ -120,6 +119,12 @@ def make_entity_linker( """ if not model.attrs.get("include_span_maker", False): + try: + from spacy_legacy.components.entity_linker import EntityLinker_v1 + except: + raise ImportError( + "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12." + ) # The only difference in arguments here is that use_gold_ents and threshold aren't available. 
return EntityLinker_v1( nlp.vocab, diff --git a/spacy/pipeline/legacy/__init__.py b/spacy/pipeline/legacy/__init__.py deleted file mode 100644 index f216840dc..000000000 --- a/spacy/pipeline/legacy/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .entity_linker import EntityLinker_v1 - -__all__ = ["EntityLinker_v1"] diff --git a/spacy/pipeline/legacy/entity_linker.py b/spacy/pipeline/legacy/entity_linker.py deleted file mode 100644 index c14dfa1db..000000000 --- a/spacy/pipeline/legacy/entity_linker.py +++ /dev/null @@ -1,422 +0,0 @@ -# This file is present to provide a prior version of the EntityLinker component -# for backwards compatability. For details see #9669. - -from typing import Optional, Iterable, Callable, Dict, Union, List, Any -from thinc.types import Floats2d -from pathlib import Path -from itertools import islice -import srsly -import random -from thinc.api import CosineDistance, Model, Optimizer -from thinc.api import set_dropout_rate -import warnings - -from ...kb import KnowledgeBase, Candidate -from ...ml import empty_kb -from ...tokens import Doc, Span -from ..pipe import deserialize_config -from ..trainable_pipe import TrainablePipe -from ...language import Language -from ...vocab import Vocab -from ...training import Example, validate_examples, validate_get_examples -from ...errors import Errors, Warnings -from ...util import SimpleFrozenList -from ... import util -from ...scorer import Scorer - -# See #9050 -BACKWARD_OVERWRITE = True - - -def entity_linker_score(examples, **kwargs): - return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs) - - -class EntityLinker_v1(TrainablePipe): - """Pipeline component for named entity linking. - - DOCS: https://spacy.io/api/entitylinker - """ - - NIL = "NIL" # string used to refer to a non-existing link - - def __init__( - self, - vocab: Vocab, - model: Model, - name: str = "entity_linker", - *, - labels_discard: Iterable[str], - n_sents: int, - incl_prior: bool, - incl_context: bool, - entity_vector_length: int, - get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], - overwrite: bool = BACKWARD_OVERWRITE, - scorer: Optional[Callable] = entity_linker_score, - ) -> None: - """Initialize an entity linker. - - vocab (Vocab): The shared vocabulary. - model (thinc.api.Model): The Thinc Model powering the pipeline component. - name (str): The component instance name, used to add entries to the - losses during training. - labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. - n_sents (int): The number of neighbouring sentences to take into account. - incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. - incl_context (bool): Whether or not to include the local context in the model. - entity_vector_length (int): Size of encoding vectors in the KB. - get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that - produces a list of candidates, given a certain knowledge base and a textual mention. - scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. 
- DOCS: https://spacy.io/api/entitylinker#init - """ - self.vocab = vocab - self.model = model - self.name = name - self.labels_discard = list(labels_discard) - self.n_sents = n_sents - self.incl_prior = incl_prior - self.incl_context = incl_context - self.get_candidates = get_candidates - self.cfg: Dict[str, Any] = {"overwrite": overwrite} - self.distance = CosineDistance(normalize=False) - # how many neighbour sentences to take into account - # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. - self.kb = empty_kb(entity_vector_length)(self.vocab) - self.scorer = scorer - - def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): - """Define the KB of this pipe by providing a function that will - create it using this object's vocab.""" - if not callable(kb_loader): - raise ValueError(Errors.E885.format(arg_type=type(kb_loader))) - - self.kb = kb_loader(self.vocab) - - def validate_kb(self) -> None: - # Raise an error if the knowledge base is not initialized. - if self.kb is None: - raise ValueError(Errors.E1018.format(name=self.name)) - if len(self.kb) == 0: - raise ValueError(Errors.E139.format(name=self.name)) - - def initialize( - self, - get_examples: Callable[[], Iterable[Example]], - *, - nlp: Optional[Language] = None, - kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None, - ): - """Initialize the pipe for training, using a representative set - of data examples. - - get_examples (Callable[[], Iterable[Example]]): Function that - returns a representative sample of gold-standard Example objects. - nlp (Language): The current nlp object the component is part of. - kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance. - Note that providing this argument, will overwrite all data accumulated in the current KB. - Use this only when loading a KB as-such from file. - - DOCS: https://spacy.io/api/entitylinker#initialize - """ - validate_get_examples(get_examples, "EntityLinker_v1.initialize") - if kb_loader is not None: - self.set_kb(kb_loader) - self.validate_kb() - nO = self.kb.entity_vector_length - doc_sample = [] - vector_sample = [] - for example in islice(get_examples(), 10): - doc_sample.append(example.x) - vector_sample.append(self.model.ops.alloc1f(nO)) - assert len(doc_sample) > 0, Errors.E923.format(name=self.name) - assert len(vector_sample) > 0, Errors.E923.format(name=self.name) - self.model.initialize( - X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") - ) - - def update( - self, - examples: Iterable[Example], - *, - drop: float = 0.0, - sgd: Optional[Optimizer] = None, - losses: Optional[Dict[str, float]] = None, - ) -> Dict[str, float]: - """Learn from a batch of documents and gold-standard information, - updating the pipe's model. Delegates to predict and get_loss. - - examples (Iterable[Example]): A batch of Example objects. - drop (float): The dropout rate. - sgd (thinc.api.Optimizer): The optimizer. - losses (Dict[str, float]): Optional record of the loss during training. - Updated using the component name as the key. - RETURNS (Dict[str, float]): The updated losses dictionary. 
- - DOCS: https://spacy.io/api/entitylinker#update - """ - self.validate_kb() - if losses is None: - losses = {} - losses.setdefault(self.name, 0.0) - if not examples: - return losses - validate_examples(examples, "EntityLinker_v1.update") - sentence_docs = [] - for eg in examples: - sentences = [s for s in eg.reference.sents] - kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.reference.ents: - # KB ID of the first token is the same as the whole span - kb_id = kb_ids[ent.start] - if kb_id: - try: - # find the sentence in the list of sentences. - sent_index = sentences.index(ent.sent) - except AttributeError: - # Catch the exception when ent.sent is None and provide a user-friendly warning - raise RuntimeError(Errors.E030) from None - # get n previous sentences, if there are any - start_sentence = max(0, sent_index - self.n_sents) - # get n posterior sentences, or as many < n as there are - end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) - # get token positions - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - # append that span as a doc to training - sent_doc = eg.predicted[start_token:end_token].as_doc() - sentence_docs.append(sent_doc) - set_dropout_rate(self.model, drop) - if not sentence_docs: - warnings.warn(Warnings.W093.format(name="Entity Linker")) - return losses - sentence_encodings, bp_context = self.model.begin_update(sentence_docs) - loss, d_scores = self.get_loss( - sentence_encodings=sentence_encodings, examples=examples - ) - bp_context(d_scores) - if sgd is not None: - self.finish_update(sgd) - losses[self.name] += loss - return losses - - def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): - validate_examples(examples, "EntityLinker_v1.get_loss") - entity_encodings = [] - for eg in examples: - kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.reference.ents: - kb_id = kb_ids[ent.start] - if kb_id: - entity_encoding = self.kb.get_vector(kb_id) - entity_encodings.append(entity_encoding) - entity_encodings = self.model.ops.asarray2f(entity_encodings) - if sentence_encodings.shape != entity_encodings.shape: - err = Errors.E147.format( - method="get_loss", msg="gold entities do not match up" - ) - raise RuntimeError(err) - gradients = self.distance.get_grad(sentence_encodings, entity_encodings) - loss = self.distance.get_loss(sentence_encodings, entity_encodings) - loss = loss / len(entity_encodings) - return float(loss), gradients - - def predict(self, docs: Iterable[Doc]) -> List[str]: - """Apply the pipeline's model to a batch of docs, without modifying them. - Returns the KB IDs for each entity in each doc, including NIL if there is - no prediction. - - docs (Iterable[Doc]): The documents to predict. - RETURNS (List[str]): The models prediction for each document. 
- - DOCS: https://spacy.io/api/entitylinker#predict - """ - self.validate_kb() - entity_count = 0 - final_kb_ids: List[str] = [] - if not docs: - return final_kb_ids - if isinstance(docs, Doc): - docs = [docs] - for i, doc in enumerate(docs): - sentences = [s for s in doc.sents] - if len(doc) > 0: - # Looping through each entity (TODO: rewrite) - for ent in doc.ents: - sent = ent.sent - sent_index = sentences.index(sent) - assert sent_index >= 0 - # get n_neighbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - sent_doc = doc[start_token:end_token].as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - xp = self.model.ops.xp - if self.incl_context: - sentence_encoding = self.model.predict([sent_doc])[0] - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) - entity_count += 1 - if ent.label_ in self.labels_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - else: - candidates = list(self.get_candidates(self.kb, ent)) - if not candidates: - # no prediction possible for this entity - setting to NIL - final_kb_ids.append(self.NIL) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - final_kb_ids.append(candidates[0].entity_) - else: - random.shuffle(candidates) - # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.incl_prior: - prior_probs = xp.asarray([0.0 for _ in candidates]) - scores = prior_probs - # add in similarity from the context - if self.incl_context: - entity_encodings = xp.asarray( - [c.entity_vector for c in candidates] - ) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError( - Errors.E147.format( - method="predict", - msg="vectors not of equal length", - ) - ) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / ( - sentence_norm * entity_norm - ) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs * sims) - best_index = scores.argmax().item() - best_candidate = candidates[best_index] - final_kb_ids.append(best_candidate.entity_) - if not (len(final_kb_ids) == entity_count): - err = Errors.E147.format( - method="predict", msg="result variables not of equal length" - ) - raise RuntimeError(err) - return final_kb_ids - - def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None: - """Modify a batch of documents, using pre-computed scores. - - docs (Iterable[Doc]): The documents to modify. - kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict. - - DOCS: https://spacy.io/api/entitylinker#set_annotations - """ - count_ents = len([ent for doc in docs for ent in doc.ents]) - if count_ents != len(kb_ids): - raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) - i = 0 - overwrite = self.cfg["overwrite"] - for doc in docs: - for ent in doc.ents: - kb_id = kb_ids[i] - i += 1 - for token in ent: - if token.ent_kb_id == 0 or overwrite: - token.ent_kb_id_ = kb_id - - def to_bytes(self, *, exclude=tuple()): - """Serialize the pipe to a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. 
- RETURNS (bytes): The serialized object. - - DOCS: https://spacy.io/api/entitylinker#to_bytes - """ - self._validate_serialization_attrs() - serialize = {} - if hasattr(self, "cfg") and self.cfg is not None: - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude) - serialize["kb"] = self.kb.to_bytes - serialize["model"] = self.model.to_bytes - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, *, exclude=tuple()): - """Load the pipe from a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (TrainablePipe): The loaded object. - - DOCS: https://spacy.io/api/entitylinker#from_bytes - """ - self._validate_serialization_attrs() - - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = {} - if hasattr(self, "cfg") and self.cfg is not None: - deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) - deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude) - deserialize["kb"] = lambda b: self.kb.from_bytes(b) - deserialize["model"] = load_model - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() - ) -> None: - """Serialize the pipe to disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - - DOCS: https://spacy.io/api/entitylinker#to_disk - """ - serialize = {} - serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude) - serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) - serialize["kb"] = lambda p: self.kb.to_disk(p) - serialize["model"] = lambda p: self.model.to_disk(p) - util.to_disk(path, serialize, exclude) - - def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() - ) -> "EntityLinker_v1": - """Load the pipe from disk. Modifies the object in place and returns it. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (EntityLinker): The modified EntityLinker object. 
- - DOCS: https://spacy.io/api/entitylinker#from_disk - """ - - def load_model(p): - try: - with p.open("rb") as infile: - self.model.from_bytes(infile.read()) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize: Dict[str, Callable[[Any], Any]] = {} - deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) - deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude) - deserialize["kb"] = lambda p: self.kb.from_disk(p) - deserialize["model"] = load_model - util.from_disk(path, deserialize, exclude) - return self - - def rehearse(self, examples, *, sgd=None, losses=None, **config): - raise NotImplementedError - - def add_label(self, label): - raise NotImplementedError diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 9a8ce6653..506530591 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -12,7 +12,6 @@ from spacy.lang.en import English from spacy.ml import load_kb from spacy.ml.models.entity_linker import build_span_maker from spacy.pipeline import EntityLinker, TrainablePipe -from spacy.pipeline.legacy import EntityLinker_v1 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tests.util import make_tempdir @@ -997,6 +996,8 @@ def test_scorer_links(): ) # fmt: on def test_legacy_architectures(name, config): + from spacy_legacy.components.entity_linker import EntityLinker_v1 + # Ensure that the legacy architectures still work vector_length = 3 nlp = English() From 4c60afb946f35e2675a5e21880ca3a09633d0bfa Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 1 Feb 2023 10:15:38 +0100 Subject: [PATCH 13/15] Backslash fixes in docs (#12213) * backslash fixes * revert unrelated change --- website/docs/api/doc.mdx | 2 +- website/docs/models/index.mdx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index 13c59c4af..0a5826500 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | | _keyword-only_ | | -| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ | +| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ | | `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | diff --git a/website/docs/models/index.mdx b/website/docs/models/index.mdx index 371e4460f..366d44f0e 100644 --- a/website/docs/models/index.mdx +++ b/website/docs/models/index.mdx @@ -21,8 +21,8 @@ menu: ## Package naming conventions {id="conventions"} In general, spaCy expects all pipeline packages to follow the naming convention -of `[lang]\_[name]`. 
For spaCy's pipelines, we also chose to divide the name -into three components: +of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into +three components: 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with tagging, parsing, lemmatization and named entity recognition, or `dep` for From 89f974d4f54fc9c24fd2cf244ed783631f191181 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 2 Feb 2023 22:13:38 +0900 Subject: [PATCH 14/15] Cleanup/remove backwards compat overwrite settings (#11888) * Remove backwards-compatible overwrite from Entity Linker This also adds a docstring about overwrite, since it wasn't present. * Fix docstring * Remove backward compat settings in Morphologizer This also needed a docstring added. For this component it's less clear what the right overwrite settings are. * Remove backward compat from sentencizer This was simple * Remove backward compat from senter Another simple one * Remove backward compat setting from tagger * Add docstrings * Update spacy/pipeline/morphologizer.pyx Co-authored-by: Adriane Boyd * Update docs --------- Co-authored-by: Adriane Boyd --- spacy/pipeline/entity_linker.py | 8 +++----- spacy/pipeline/morphologizer.pyx | 10 ++++------ spacy/pipeline/sentencizer.pyx | 6 ++---- spacy/pipeline/senter.pyx | 5 ++--- spacy/pipeline/tagger.pyx | 6 ++---- website/docs/api/entitylinker.mdx | 2 +- website/docs/api/morphologizer.mdx | 2 +- 7 files changed, 15 insertions(+), 24 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 6fe322b62..63d5cccc2 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -27,9 +27,6 @@ ActivationsT = Dict[str, Union[List[Ragged], List[str]]] KNOWLEDGE_BASE_IDS = "kb_ids" -# See #9050 -BACKWARD_OVERWRITE = True - default_model_config = """ [model] @architectures = "spacy.EntityLinker.v2" @@ -60,7 +57,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, - "overwrite": True, + "overwrite": False, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, "candidates_batch_size": 1, @@ -191,7 +188,7 @@ class EntityLinker(TrainablePipe): get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], - overwrite: bool = BACKWARD_OVERWRITE, + overwrite: bool = False, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, candidates_batch_size: int, @@ -215,6 +212,7 @@ class EntityLinker(TrainablePipe): Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + overwrite (bool): Whether to overwrite existing non-empty annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. 
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 293add9e1..fabc51fee 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -21,10 +21,6 @@ from ..scorer import Scorer from ..training import validate_examples, validate_get_examples from ..util import registry -# See #9050 -BACKWARD_OVERWRITE = True -BACKWARD_EXTEND = False - default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -102,8 +98,8 @@ class Morphologizer(Tagger): model: Model, name: str = "morphologizer", *, - overwrite: bool = BACKWARD_OVERWRITE, - extend: bool = BACKWARD_EXTEND, + overwrite: bool = False, + extend: bool = False, scorer: Optional[Callable] = morphologizer_score, save_activations: bool = False, ): @@ -113,6 +109,8 @@ class Morphologizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. + extend (bool): Whether to extend existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attributes "pos" and "morph" and Scorer.score_token_attr_per_feat for the attribute "morph". diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 77f4e8adb..6c2565170 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -10,9 +10,6 @@ from ..language import Language from ..scorer import Scorer from .. import util -# see #9050 -BACKWARD_OVERWRITE = False - @Language.factory( "sentencizer", assigns=["token.is_sent_start", "doc.sents"], @@ -52,13 +49,14 @@ class Sentencizer(Pipe): name="sentencizer", *, punct_chars=None, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=senter_score, ): """Initialize the sentencizer. punct_chars (list): Punctuation characters to split on. Will be serialized with the nlp object. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the attribute "sents". diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 42feeb277..a7d263e94 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -18,8 +18,6 @@ from ..training import validate_examples, validate_get_examples from ..util import registry from .. import util -# See #9050 -BACKWARD_OVERWRITE = False default_model_config = """ [model] @@ -83,7 +81,7 @@ class SentenceRecognizer(Tagger): model, name="senter", *, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=senter_score, save_activations: bool = False, ): @@ -93,6 +91,7 @@ class SentenceRecognizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the attribute "sents". save_activations (bool): save model activations in Doc when annotating. diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index a6be51c3c..101d8bcea 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -27,9 +27,6 @@ from .. 
import util ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] -# See #9050 -BACKWARD_OVERWRITE = False - default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -99,7 +96,7 @@ class Tagger(TrainablePipe): model, name="tagger", *, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=tagger_score, neg_prefix="!", save_activations: bool = False, @@ -110,6 +107,7 @@ class Tagger(TrainablePipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attribute "tag". save_activations (bool): save model activations in Doc when annotating. diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 238b62a2e..12b2f6bef 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -63,7 +63,7 @@ architectures and their arguments and hyperparameters. | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | | `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | | `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx index 4660ec312..9514bc773 100644 --- a/website/docs/api/morphologizer.mdx +++ b/website/docs/api/morphologizer.mdx @@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters. | Setting | Description | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ | | `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. 
Defaults to `False`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ | From eec5ccd72f7eb6243dafb0a2e380a7d9ef9a0dbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 3 Feb 2023 15:22:25 +0100 Subject: [PATCH 15/15] `Language.update`: ensure that tok2vec gets updated (#12136) * `Language.update`: ensure that tok2vec gets updated The components in a pipeline can be updated independently. However, tok2vec implementations are an exception to this, since they depend on listeners for their gradients. The update method of a tok2vec implementation computes the tok2vec forward and passes this along with a backprop function to the listeners. This backprop function accumulates gradients for all the listeners. There are two ways in which the accumulated gradients can be used to update the tok2vec weights: 1. Call the `finish_update` method of tok2vec *after* the `update` method is called on all of the pipes that use a tok2vec listener. 2. Pass an optimizer to the `update` method of tok2vec. In this case, tok2vec will give the last listener a special backprop function that calls `finish_update` on the tok2vec. Unfortunately, `Language.update` did neither of these. Instead, it immediately called `finish_update` on every pipe after `update`. As a result, the tok2vec weights are updated when no gradients have been accumulated from listeners yet. And the gradients of the listeners are only used in the next call to `Language.update` (when `finish_update` is called on tok2vec again). This change fixes this issue by passing the optimizer to the `update` method of trainable pipes, leading to use of the second strategy outlined above. The main updating loop in `Language.update` is also simplified by using the `TrainableComponent` protocol consistently. * Train loop: `sgd` is `Optional[Optimizer]`, do not pass false * Language.update: call pipe finish_update after all pipe updates This does correct and fast updates if multiple components update the same parameters. 
* Add comment why we moved `finish_update` to a separate loop --- spacy/language.py | 28 +++++--- .../pipeline/test_annotates_on_update.py | 12 +++- spacy/tests/test_language.py | 68 ++++++++++++++++++- spacy/training/loop.py | 2 +- 4 files changed, 95 insertions(+), 15 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index d2b89029d..fb86689bc 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1248,17 +1248,12 @@ class Language: component_cfg[name].setdefault("drop", drop) pipe_kwargs[name].setdefault("batch_size", self.batch_size) for name, proc in self.pipeline: - # ignore statements are used here because mypy ignores hasattr - if name not in exclude and hasattr(proc, "update"): - proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore - if sgd not in (None, False): - if ( - name not in exclude - and isinstance(proc, ty.TrainableComponent) - and proc.is_trainable - and proc.model not in (True, False, None) - ): - proc.finish_update(sgd) + if ( + name not in exclude + and isinstance(proc, ty.TrainableComponent) + and proc.is_trainable + ): + proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) if name in annotates: for doc, eg in zip( _pipe( @@ -1271,6 +1266,17 @@ class Language: examples, ): eg.predicted = doc + # Only finish the update after all component updates are done. Some + # components may share weights (such as tok2vec) and we only want + # to apply weight updates after all gradients are accumulated. + for name, proc in self.pipeline: + if ( + name not in exclude + and isinstance(proc, ty.TrainableComponent) + and proc.is_trainable + ): + proc.finish_update(sgd) + return losses def rehearse( diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py index 869b8b874..10fb22c97 100644 --- a/spacy/tests/pipeline/test_annotates_on_update.py +++ b/spacy/tests/pipeline/test_annotates_on_update.py @@ -54,9 +54,11 @@ def test_annotates_on_update(): return AssertSents(name) class AssertSents: + model = None + is_trainable = True + def __init__(self, name, **cfg): self.name = name - pass def __call__(self, doc): if not doc.has_annotation("SENT_START"): @@ -64,10 +66,16 @@ def test_annotates_on_update(): return doc def update(self, examples, *, drop=0.0, sgd=None, losses=None): + losses.setdefault(self.name, 0.0) + for example in examples: if not example.predicted.has_annotation("SENT_START"): raise ValueError("No sents") - return {} + + return losses + + def finish_update(self, sgd=None): + pass nlp = English() nlp.add_pipe("sentencizer") diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index f2d6d5fc0..3d0905dd3 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -10,8 +10,9 @@ from spacy.training import Example from spacy.lang.en import English from spacy.lang.de import German from spacy.util import registry, ignore_error, raise_error, find_matching_language +from spacy.util import load_model_from_config import spacy -from thinc.api import CupyOps, NumpyOps, get_current_ops +from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops from .util import add_vecs_to_vocab, assert_docs_equal @@ -25,6 +26,51 @@ try: except ImportError: pass +TAGGER_CFG_STRING = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","tagger"] + + [components] + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v2" + nO = null + + [components.tagger.model.tok2vec] 
+ @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v1" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +TAGGER_TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), +] + TAGGER_TRAIN_DATA = [ ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), @@ -91,6 +137,26 @@ def test_language_update(nlp): example = Example.from_dict(doc, wrongkeyannots) +def test_language_update_updates(): + config = Config().from_str(TAGGER_CFG_STRING) + nlp = load_model_from_config(config, auto_fill=True, validate=True) + + train_examples = [] + for t in TAGGER_TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + optimizer = nlp.initialize(get_examples=lambda: train_examples) + + docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples])) + nlp.update(train_examples, sgd=optimizer) + docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples])) + + xp = get_array_module(docs_after_update[0].tensor) + assert xp.any( + xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor) + ) + + def test_language_evaluate(nlp): text = "hello world" annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}} diff --git a/spacy/training/loop.py b/spacy/training/loop.py index fc929816d..fcc023a0d 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -210,7 +210,7 @@ def train_while_improving( subbatch, drop=dropout, losses=losses, - sgd=False, # type: ignore[arg-type] + sgd=None, exclude=exclude, annotates=annotating_components, )