From 6aa6b86d496c8d9271f42c077a79f9bfb88687ac Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 1 Mar 2023 16:02:55 +0100 Subject: [PATCH 1/8] Make generation of empty `KnowledgeBase` instances configurable in `EntityLinker` (#12320) * Make empty_kb() configurable. * Format. * Update docs. * Be more specific in KB serialization test. * Update KB serialization tests. Update docs. * Remove doc update for batched candidate generation. * Fix serialization of subclassed KB in tests. * Format. * Update docstring. * Update docstring. * Switch from pickle to json for custom field serialization. --- spacy/ml/models/entity_linker.py | 8 +++ spacy/pipeline/entity_linker.py | 11 +++- spacy/tests/serialize/test_serialize_kb.py | 71 +++++++++++++++++++--- website/docs/api/architectures.mdx | 10 ++- website/docs/api/entitylinker.mdx | 28 +++++---- 5 files changed, 101 insertions(+), 27 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 299b6bb52..7332ca199 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -89,6 +89,14 @@ def load_kb( return kb_from_file +@registry.misc("spacy.EmptyKB.v2") +def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length) + + return empty_kb_factory + + @registry.misc("spacy.EmptyKB.v1") def empty_kb( entity_vector_length: int, diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index a11964117..f2dae0529 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -54,6 +54,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, "overwrite": True, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, @@ -80,6 +81,7 @@ def make_entity_linker( get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, scorer: Optional[Callable], use_gold_ents: bool, @@ -101,6 +103,7 @@ def make_entity_linker( get_candidates_batch ( Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. @@ -135,6 +138,7 @@ def make_entity_linker( entity_vector_length=entity_vector_length, get_candidates=get_candidates, get_candidates_batch=get_candidates_batch, + generate_empty_kb=generate_empty_kb, overwrite=overwrite, scorer=scorer, use_gold_ents=use_gold_ents, @@ -175,6 +179,7 @@ class EntityLinker(TrainablePipe): get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, @@ -198,6 +203,7 @@ class EntityLinker(TrainablePipe): Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. @@ -220,6 +226,7 @@ class EntityLinker(TrainablePipe): self.model = model self.name = name self.labels_discard = list(labels_discard) + # how many neighbour sentences to take into account self.n_sents = n_sents self.incl_prior = incl_prior self.incl_context = incl_context @@ -227,9 +234,7 @@ class EntityLinker(TrainablePipe): self.get_candidates_batch = get_candidates_batch self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) - # how many neighbour sentences to take into account - # create an empty KB by default - self.kb = empty_kb(entity_vector_length)(self.vocab) + self.kb = generate_empty_kb(self.vocab, entity_vector_length) self.scorer = scorer self.use_gold_ents = use_gold_ents self.candidates_batch_size = candidates_batch_size diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 8d3653ab1..f9d2e226b 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,7 +1,10 @@ -from typing import Callable +from pathlib import Path +from typing import Callable, Iterable, Any, Dict -from spacy import util -from spacy.util import ensure_path, registry, load_model_from_config +import srsly + +from spacy import util, Errors +from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList from spacy.kb.kb_in_memory import InMemoryLookupKB from spacy.vocab import Vocab from thinc.api import Config @@ -91,7 +94,10 @@ def test_serialize_subclassed_kb(): [components.entity_linker] factory = "entity_linker" - + + [components.entity_linker.generate_empty_kb] + @misc = "kb_test.CustomEmptyKB.v1" + [initialize] [initialize.components] @@ -99,7 +105,7 @@ def test_serialize_subclassed_kb(): [initialize.components.entity_linker] [initialize.components.entity_linker.kb_loader] - @misc = "spacy.CustomKB.v1" + @misc = "kb_test.CustomKB.v1" entity_vector_length = 342 custom_field = 666 """ @@ -109,10 +115,57 @@ def test_serialize_subclassed_kb(): super().__init__(vocab, entity_vector_length) self.custom_field = custom_field - @registry.misc("spacy.CustomKB.v1") + def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well.""" + path = ensure_path(path) + if not path.exists(): + path.mkdir(parents=True) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def serialize_custom_fields(file_path: Path) -> None: + srsly.write_json(file_path, {"custom_field": self.custom_field}) + + serialize = { + "contents": lambda p: self.write_contents(p), + "strings.json": lambda p: self.vocab.strings.to_disk(p), + "custom_fields": lambda p: serialize_custom_fields(p), + } + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well.""" + path = ensure_path(path) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def deserialize_custom_fields(file_path: Path) -> None: + self.custom_field = srsly.read_json(file_path)["custom_field"] + + deserialize: Dict[str, Callable[[Any], Any]] = { + "contents": lambda p: self.read_contents(p), + "strings.json": lambda p: self.vocab.strings.from_disk(p), + "custom_fields": lambda p: deserialize_custom_fields(p), + } + util.from_disk(path, deserialize, exclude) + + @registry.misc("kb_test.CustomEmptyKB.v1") + def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return SubInMemoryLookupKB( + vocab=vocab, + entity_vector_length=entity_vector_length, + custom_field=0, + ) + + return empty_kb_factory + + @registry.misc("kb_test.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int - ) -> Callable[[Vocab], InMemoryLookupKB]: + ) -> Callable[[Vocab], SubInMemoryLookupKB]: def custom_kb_factory(vocab): kb = SubInMemoryLookupKB( vocab=vocab, @@ -139,6 +192,6 @@ def test_serialize_subclassed_kb(): nlp2 = util.load_model_from_path(tmp_dir) entity_linker2 = nlp2.get_pipe("entity_linker") # After IO, the KB is the standard one - assert type(entity_linker2.kb) == InMemoryLookupKB + assert type(entity_linker2.kb) == SubInMemoryLookupKB assert entity_linker2.kb.entity_vector_length == 342 - assert not hasattr(entity_linker2.kb, "custom_field") + assert entity_linker2.kb.custom_field == 666 diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index 966b5830a..268c04a07 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -899,15 +899,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a | `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.EmptyKB.v1 {id="EmptyKB"} +### spacy.EmptyKB.v1 {id="EmptyKB.v1"} A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) -instance. This is the default when a new entity linker component is created. +instance. | Name | Description | | ---------------------- | ----------------------------------------------------------------------------------- | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | +### spacy.EmptyKB.v2 {id="EmptyKB"} + +A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) +instance. This is the default when a new entity linker component is created. It +returns a `Callable[[Vocab, int], InMemoryLookupKB]`. + ### spacy.KBFromFile.v1 {id="KBFromFile"} A function that reads an existing `KnowledgeBase` from file. diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index bafb2f2da..d84dd3ca9 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -53,19 +53,21 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| Setting | Description | +| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | +| `generate_empty_kb` 3.6 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py From 0bbc620dd80007ac22d8bf1c9f6202eebc748596 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Mar 2023 14:48:57 +0100 Subject: [PATCH 2/8] Partially work around pending deprecation of pkg_resources (#12368) * Handle deprecation of pkg_resources * Replace `pkg_resources` with `importlib_metadata` for `spacy info --url` * Remove requirements check from `spacy project` given the lack of alternatives * Fix installed model URL method and CI test * Fix types/handling, simplify catch-all return * Move imports instead of disabling requirements check * Format * Reenable test with ignored deprecation warning * Fix except * Fix return --- .github/azure-steps.yml | 5 +++++ spacy/cli/info.py | 17 ++++++++--------- spacy/cli/project/run.py | 2 +- spacy/tests/test_cli.py | 4 +++- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index ed69f611b..b2ccf3d81 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -59,6 +59,11 @@ steps: displayName: 'Test download CLI' condition: eq(variables['python_version'], '3.8') + - script: | + python -W error -m spacy info ca_core_news_sm | grep -q download_url + displayName: 'Test download_url in info CLI' + condition: eq(variables['python_version'], '3.8') + - script: | python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" displayName: 'Test no warnings on load (#11713)' diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 974bc0f4e..d82bf3fbc 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,6 +1,5 @@ from typing import Optional, Dict, Any, Union, List import platform -import pkg_resources import json from pathlib import Path from wasabi import Printer, MarkdownRenderer @@ -10,6 +9,7 @@ from ._util import app, Arg, Opt, string_to_list from .download import get_model_filename, get_latest_version from .. import util from .. import about +from ..compat import importlib_metadata @app.command("info") @@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]: dist-info available. """ try: - dist = pkg_resources.get_distribution(model) - data = json.loads(dist.get_metadata("direct_url.json")) - return data["url"] - except pkg_resources.DistributionNotFound: - # no such package - return None + dist = importlib_metadata.distribution(model) + text = dist.read_text("direct_url.json") + if isinstance(text, str): + data = json.loads(text) + return data["url"] except Exception: - # something else, like no file or invalid JSON - return None + pass + return None def info_model_url(model: str) -> Dict[str, Any]: diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 6dd174902..0f4858a99 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple import os.path from pathlib import Path -import pkg_resources from wasabi import msg from wasabi.util import locale_escape import sys @@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts exist. """ + import pkg_resources failed_pkgs_msgs: List[str] = [] conflicting_pkgs_msgs: List[str] = [] diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index dc7ce46fe..752750d33 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -2,7 +2,6 @@ import os import math from collections import Counter from typing import Tuple, List, Dict, Any -import pkg_resources import time from pathlib import Path @@ -1126,6 +1125,7 @@ def test_cli_find_threshold(capsys): ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.parametrize( "reqs,output", [ @@ -1158,6 +1158,8 @@ def test_cli_find_threshold(capsys): ], ) def test_project_check_requirements(reqs, output): + import pkg_resources + # excessive guard against unlikely package name try: pkg_resources.require("spacyunknowndoesnotexist12345") From 5ecb3babed28cf8d39da9943ae24186e2bef6133 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Mar 2023 17:30:17 +0100 Subject: [PATCH 3/8] Update to use absolute imports in tests (#12372) --- spacy/tests/parser/test_ner.py | 4 ++-- spacy/tests/parser/test_parse.py | 6 +++--- spacy/tests/test_cli.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 00889efdc..030182a63 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -9,6 +9,8 @@ from spacy.lang.en import English from spacy.lang.it import Italian from spacy.language import Language from spacy.lookups import Lookups +from spacy.pipeline import EntityRecognizer +from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.training import Example, iob_to_biluo, split_bilu_label from spacy.tokens import Doc, Span @@ -16,8 +18,6 @@ from spacy.vocab import Vocab import logging from ..util import make_tempdir -from ...pipeline import EntityRecognizer -from ...pipeline.ner import DEFAULT_NER_MODEL TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index aaf31ed56..4b05c6721 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -8,11 +8,11 @@ from spacy.lang.en import English from spacy.tokens import Doc from spacy.training import Example from spacy.vocab import Vocab +from spacy.pipeline import DependencyParser +from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL +from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL -from ...pipeline import DependencyParser -from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL from ..util import apply_transition_sequence, make_tempdir -from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL TRAIN_DATA = [ ( diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 752750d33..f5bcdfd23 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -28,6 +28,7 @@ from spacy.cli.debug_data import _print_span_characteristics from spacy.cli.debug_data import _get_spans_length_freq_dist from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config +from spacy.cli.init_pipeline import _init_labels from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name from spacy.cli.project.remote_storage import RemoteStorage @@ -46,7 +47,6 @@ from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config -from ..cli.init_pipeline import _init_labels from .util import make_tempdir From 260cb9c6febc5c7c2c17686d4f4e62c13b6833b9 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Mar 2023 18:06:31 +0100 Subject: [PATCH 4/8] Raise error for non-default vectors with PretrainVectors (#12366) --- spacy/errors.py | 2 + spacy/ml/models/multi_task.py | 3 ++ spacy/tests/training/test_pretraining.py | 47 +++++++++++++++++++----- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 2c8b98aad..1047ed21a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -549,6 +549,8 @@ class Errors(metaclass=ErrorsWithCodes): "during training, make sure to include it in 'annotating components'") # New errors added in v3.x + E850 = ("The PretrainVectors objective currently only supports default " + "vectors, not {mode} vectors.") E851 = ("The 'textcat' component labels should only have values of 0 or 1, " "but found value of '{val}'.") E852 = ("The tar file pulled from the remote attempted an unsafe path " diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index a7d67c6dd..826fddd4f 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -8,6 +8,7 @@ from thinc.loss import Loss from ...util import registry, OOV_RANK from ...errors import Errors from ...attrs import ID +from ...vectors import Mode as VectorsMode import numpy from functools import partial @@ -23,6 +24,8 @@ def create_pretrain_vectors( maxout_pieces: int, hidden_size: int, loss: str ) -> Callable[["Vocab", Model], Model]: def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model: + if vocab.vectors.mode != VectorsMode.default: + raise ValueError(Errors.E850.format(mode=vocab.vectors.mode)) if vocab.vectors.shape[1] == 0: raise ValueError(Errors.E875) model = build_cloze_multi_task_model( diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py index 9359c8485..c0d64f1e7 100644 --- a/spacy/tests/training/test_pretraining.py +++ b/spacy/tests/training/test_pretraining.py @@ -2,17 +2,19 @@ from pathlib import Path import numpy as np import pytest import srsly -from spacy.vocab import Vocab -from thinc.api import Config +from thinc.api import Config, get_current_ops +from spacy import util +from spacy.lang.en import English +from spacy.training.initialize import init_nlp +from spacy.training.loop import train +from spacy.training.pretrain import pretrain +from spacy.tokens import Doc, DocBin +from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH +from spacy.ml.models.multi_task import create_pretrain_vectors +from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import make_tempdir -from ... import util -from ...lang.en import English -from ...training.initialize import init_nlp -from ...training.loop import train -from ...training.pretrain import pretrain -from ...tokens import Doc, DocBin -from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH pretrain_string_listener = """ [nlp] @@ -346,3 +348,30 @@ def write_vectors_model(tmp_dir): nlp = English(vocab) nlp.to_disk(nlp_path) return str(nlp_path) + + +def test_pretrain_default_vectors(): + nlp = English() + nlp.add_pipe("tok2vec") + nlp.initialize() + + # default vectors are supported + nlp.vocab.vectors = Vectors(shape=(10, 10)) + create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model) + + # error for no vectors + with pytest.raises(ValueError, match="E875"): + nlp.vocab.vectors = Vectors() + create_pretrain_vectors(1, 1, "cosine")( + nlp.vocab, nlp.get_pipe("tok2vec").model + ) + + # error for floret vectors + with pytest.raises(ValueError, match="E850"): + ops = get_current_ops() + nlp.vocab.vectors = Vectors( + data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1 + ) + create_pretrain_vectors(1, 1, "cosine")( + nlp.vocab, nlp.get_pipe("tok2vec").model + ) From 3bf4539e31c5b2b151f46d22c9fa1a9ef346f972 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 7 Mar 2023 13:29:08 +0100 Subject: [PATCH 5/8] fix types (#12365) --- spacy/lexeme.pyi | 3 ++- spacy/lexeme.pyx | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi index 4fcaa82cf..9b7a6156a 100644 --- a/spacy/lexeme.pyi +++ b/spacy/lexeme.pyi @@ -25,7 +25,8 @@ class Lexeme: def orth_(self) -> str: ... @property def text(self) -> str: ... - lower: str + orth: int + lower: int norm: int shape: int prefix: int diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 6c66effde..e70feaf9a 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -199,7 +199,7 @@ cdef class Lexeme: return self.orth_ property lower: - """RETURNS (str): Lowercase form of the lexeme.""" + """RETURNS (uint64): Lowercase form of the lexeme.""" def __get__(self): return self.c.lower From e656189ec35b15ea1fedbbafc115b91fea9f5957 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 8 Mar 2023 01:47:45 +0900 Subject: [PATCH 6/8] Change GPU efficient textcat to use CNN, not BOW in generated configs (#11900) * Change GPU efficient textcat to use CNN, not BOW If you generate a config with a textcat component using GPU (transformers), the defaut option (efficiency) uses a BOW architecture, which does not use tok2vec features. While that can make sense as part of a larger pipeline, in the case of just a transformer and a textcat, that means the transformer is doing a lot of work for no purpose. This changes it so that the CNN architecture is used instead. It could also be changed to be the same as the accuracy config, which uses the ensemble architecture. * Add the transformer when using a textcat with GPU * Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928) * Switch ubuntu-latest to ubuntu-20.04 in main tests * Only use 20.04 for 3.6 * Require thinc v8.1.7 * Require thinc v8.1.8 * Break up longer expression --------- Co-authored-by: Adriane Boyd --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 +-- spacy/cli/templates/quickstart_training.jinja | 31 ++++++++++++++----- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7abd7a96f..9cd96ac2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.1.0,<8.2.0", + "thinc>=8.1.8,<8.2.0", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index bc9fc183c..63e03d558 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.1.0,<8.2.0 +thinc>=8.1.8,<8.2.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 diff --git a/setup.cfg b/setup.cfg index cddc5148c..27499805b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,7 +39,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.1.0,<8.2.0 + thinc>=8.1.8,<8.2.0 install_requires = # Our libraries spacy-legacy>=3.0.11,<3.1.0 @@ -47,7 +47,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.1.0,<8.2.0 + thinc>=8.1.8,<8.2.0 wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index b961ac892..441189341 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -24,8 +24,11 @@ gpu_allocator = null lang = "{{ lang }}" {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%} {%- set with_accuracy = optimize == "accuracy" -%} -{%- set has_accurate_textcat = has_textcat and with_accuracy -%} -{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%} +{# The BOW textcat doesn't need a source of features, so it can omit the +tok2vec/transformer. #} +{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%} +{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%} +{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%} {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%} {%- else -%} {%- set full_pipeline = components -%} @@ -221,10 +224,16 @@ no_output_layer = false {% else -%} [components.textcat.model] -@architectures = "spacy.TextCatBOW.v2" +@architectures = "spacy.TextCatCNN.v2" exclusive_classes = true -ngram_size = 1 -no_output_layer = false +nO = null + +[components.textcat.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.textcat.model.tok2vec.pooling] +@layers = "reduce_mean.v1" {%- endif %} {%- endif %} @@ -252,10 +261,16 @@ no_output_layer = false {% else -%} [components.textcat_multilabel.model] -@architectures = "spacy.TextCatBOW.v2" +@architectures = "spacy.TextCatCNN.v2" exclusive_classes = false -ngram_size = 1 -no_output_layer = false +nO = null + +[components.textcat_multilabel.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.textcat_multilabel.model.tok2vec.pooling] +@layers = "reduce_mean.v1" {%- endif %} {%- endif %} From f53d945b2dcbbd81def1a140e4279e1118c2c0b6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Mar 2023 10:22:23 +0100 Subject: [PATCH 7/8] Fix merge --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d0f38084d..9cd96ac2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,6 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.1.6,<8.2.0", "thinc>=8.1.8,<8.2.0", "numpy>=1.15.0", ] From 2713890eccbbaaa8fedfbbad86dc208c4b15a9de Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Mar 2023 10:59:24 +0100 Subject: [PATCH 8/8] Update website/docs/api/spancategorizer.mdx --- website/docs/api/spancategorizer.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx index c9ae8e483..c7de2324b 100644 --- a/website/docs/api/spancategorizer.mdx +++ b/website/docs/api/spancategorizer.mdx @@ -68,11 +68,11 @@ architectures and their arguments and hyperparameters. > "spans_key": "labeled_spans", > "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, > "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, -> # Additional spancat_exclusive parameters +> # Additional spancat_singlelabel parameters > "negative_weight": 0.8, > "allow_overlap": True, > } -> nlp.add_pipe("spancat_exclusive", config=config) +> nlp.add_pipe("spancat_singlelabel", config=config) > ``` | Setting | Description |