diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 7c3c3e0a6..d1154756c 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -69,6 +69,11 @@ steps:
 #    displayName: 'Test skip re-download (#12188)'
 #    condition: eq(variables['python_version'], '3.8')
 
+#  - script: |
+#      python -W error -m spacy info ca_core_news_sm | grep -q download_url
+#    displayName: 'Test download_url in info CLI'
+#    condition: eq(variables['python_version'], '3.8')
+
   - script: |
       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
     displayName: 'Test convert CLI'
diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml
index 70882c3cc..555322782 100644
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@@ -16,7 +16,7 @@ jobs:
         with:
           ref: ${{ github.head_ref }}
       - uses: actions/setup-python@v4
-      - run: pip install black
+      - run: pip install black -c requirements.txt
       - name: Auto-format code if needed
         run: black spacy
         # We can't run black --check here because that returns a non-zero excit
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1a7c0c9a4..3c0b27c1d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
 Python modules. If you've built spaCy from source, you'll already have both
 tools installed.
 
+As a general rule of thumb, we use f-strings for any formatting of strings.
+One exception is calls to Python's `logging` functionality.
+To avoid unnecessary string conversions in these cases, we use string formatting
+templates with `%s` and `%d` etc.
+
 **⚠️ Note that formatting and linting is currently only possible for Python
 modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
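A minimal sketch of the formatting convention described in the CONTRIBUTING.md change above (the logger and the values are invented for the example):

    import logging

    logger = logging.getLogger(__name__)
    overrides = {"training.max_steps": 100}

    # General case: f-strings for building strings.
    summary = f"Loaded {len(overrides)} config override(s)"

    # Exception: logging calls use %-style templates, so the message is only
    # interpolated when the DEBUG level is actually enabled.
    logger.debug("Config overrides from CLI: %s", list(overrides))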
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index a6a575315..9b7ebbe01 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -41,7 +41,7 @@ jobs:
         inputs:
           versionSpec: "3.8"
       - script: |
-          pip install black==22.3.0
+          pip install black -c requirements.txt
          python -m black spacy --check
        displayName: "black"
      - script: |
diff --git a/requirements.txt b/requirements.txt
index 78cccfbf1..6f4b61918 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<0.1000; platform_machine != "aarch64"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
+types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
-black>=22.0,<23.0
+black==22.3.0
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 42883f896..d763fba1f 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -90,9 +90,9 @@ def parse_config_overrides(
     cli_overrides = _parse_overrides(args, is_cli=True)
     if cli_overrides:
         keys = [k for k in cli_overrides if k not in env_overrides]
-        logger.debug(f"Config overrides from CLI: {keys}")
+        logger.debug("Config overrides from CLI: %s", keys)
     if env_overrides:
-        logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+        logger.debug("Config overrides from env variables: %s", list(env_overrides))
     return {**cli_overrides, **env_overrides}
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 974bc0f4e..23b69a81d 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,10 +1,10 @@
 from typing import Optional, Dict, Any, Union, List
 import platform
-import pkg_resources
 import json
 from pathlib import Path
 from wasabi import Printer, MarkdownRenderer
 import srsly
+import importlib.metadata
 
 from ._util import app, Arg, Opt, string_to_list
 from .download import get_model_filename, get_latest_version
@@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
     dist-info available.
     """
     try:
-        dist = pkg_resources.get_distribution(model)
-        data = json.loads(dist.get_metadata("direct_url.json"))
-        return data["url"]
-    except pkg_resources.DistributionNotFound:
-        # no such package
-        return None
+        dist = importlib.metadata.distribution(model)
+        text = dist.read_text("direct_url.json")
+        if isinstance(text, str):
+            data = json.loads(text)
+            return data["url"]
     except Exception:
-        # something else, like no file or invalid JSON
-        return None
+        pass
+    return None
 
 
 def info_model_url(model: str) -> Dict[str, Any]:
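A standalone sketch of the importlib.metadata-based lookup that replaces pkg_resources in spacy/cli/info.py above (the package name is only an example):

    import importlib.metadata
    import json

    def installed_package_url(name: str):
        # Read the direct_url.json metadata pip records for URL installs;
        # return None if the package or the file is missing.
        try:
            dist = importlib.metadata.distribution(name)
            text = dist.read_text("direct_url.json")
            if isinstance(text, str):
                return json.loads(text)["url"]
        except Exception:
            pass
        return None

    print(installed_package_url("en_core_web_sm"))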
Skipping %s outputs", cmd["name"]) continue cmd_hash = get_command_hash( "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] ) - logger.debug(f"CMD_HASH: {cmd_hash}") + logger.debug("CMD_HASH: %s", cmd_hash) for output_path in cmd.get("outputs", []): output_loc = project_dir / output_path if output_loc.exists() and _is_not_empty_dir(output_loc): @@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str): content_hash=get_content_hash(output_loc), ) logger.debug( - f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}" + "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash ) yield output_path, url diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 6dd174902..0f4858a99 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple import os.path from pathlib import Path -import pkg_resources from wasabi import msg from wasabi.util import locale_escape import sys @@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts exist. """ + import pkg_resources failed_pkgs_msgs: List[str] = [] conflicting_pkgs_msgs: List[str] = [] diff --git a/spacy/errors.py b/spacy/errors.py index 56cdde409..46de2d41e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -84,7 +84,7 @@ class Warnings(metaclass=ErrorsWithCodes): "ignoring the duplicate entry.") W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " "incorrect. Modify PhraseMatcher._terminal_hash to fix.") - W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " + W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in " "the Knowledge Base.") W026 = ("Unable to set all sentence boundaries from dependency parses. If " "you are constructing a parse tree incrementally by setting " @@ -212,7 +212,11 @@ class Warnings(metaclass=ErrorsWithCodes): "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") + # v4 warning strings W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") + W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability " + "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure " + "to return `True` in `.supports_prior_probs`.") class Errors(metaclass=ErrorsWithCodes): @@ -440,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes): E133 = ("The sum of prior probabilities for alias '{alias}' should not " "exceed 1, but found {sum}.") E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") - E139 = ("Knowledge base for component '{name}' is empty. Use the methods " - "`kb.add_entity` and `kb.add_alias` to add entries.") + E139 = ("Knowledge base for component '{name}' is empty.") E140 = ("The list of entities, prior probabilities and entity vectors " "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " @@ -954,7 +957,7 @@ class Errors(metaclass=ErrorsWithCodes): E1049 = ("No available port found for displaCy on host {host}. Please specify an available port " "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. 
Please specify an available port with `displacy.serve(doc, port=port)` " - "or use `auto_switch_port=True` to pick an available port automatically.") + "or use `auto_select_port=True` to pick an available port automatically.") # v4 error strings E4000 = ("Expected a Doc as input, but got: '{type}'") @@ -964,7 +967,9 @@ class Errors(metaclass=ErrorsWithCodes): E4003 = ("Training examples for distillation must have the exact same tokens in the " "reference and predicted docs.") E4004 = ("Backprop is not supported when is_train is not set.") - E4005 = ("Required lemmatizer table(s) {missing_tables} not found in " + E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.") + E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.") + E4007 = ("Required lemmatizer table(s) {missing_tables} not found in " "[initialize] or in registered lookups (spacy-lookups-data). An " "example for how to load lemmatizer tables in [initialize]:\n\n" "[initialize.components]\n\n" @@ -975,7 +980,8 @@ class Errors(metaclass=ErrorsWithCodes): f'url = "{about.__lookups_url__}"\n' "tables = {tables}\n" "# or required tables only: tables = {required_tables}\n") - E4006 = ("Server error ({status_code}), couldn't fetch {url}") + E4008 = ("Server error ({status_code}), couldn't fetch {url}") + RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py index 1d70a9b34..c8a657d62 100644 --- a/spacy/kb/__init__.py +++ b/spacy/kb/__init__.py @@ -1,3 +1,5 @@ from .kb import KnowledgeBase from .kb_in_memory import InMemoryLookupKB -from .candidate import Candidate, get_candidates, get_candidates_batch +from .candidate import Candidate, InMemoryCandidate + +__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd index 942ce9dd0..f21f423e4 100644 --- a/spacy/kb/candidate.pxd +++ b/spacy/kb/candidate.pxd @@ -1,12 +1,15 @@ -from .kb cimport KnowledgeBase from libcpp.vector cimport vector +from .kb_in_memory cimport InMemoryLookupKB from ..typedefs cimport hash_t -# Object used by the Entity Linker that summarizes one entity-alias candidate combination. cdef class Candidate: - cdef readonly KnowledgeBase kb - cdef hash_t entity_hash - cdef float entity_freq - cdef vector[float] entity_vector - cdef hash_t alias_hash - cdef float prior_prob + pass + + +cdef class InMemoryCandidate(Candidate): + cdef readonly hash_t _entity_hash + cdef readonly hash_t _alias_hash + cpdef vector[float] _entity_vector + cdef float _prior_prob + cdef readonly InMemoryLookupKB _kb + cdef float _entity_freq diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index c89efeb03..3d8da4b95 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -1,74 +1,96 @@ # cython: infer_types=True, profile=True -from typing import Iterable -from .kb cimport KnowledgeBase -from ..tokens import Span +from .kb_in_memory cimport InMemoryLookupKB +from ..errors import Errors cdef class Candidate: - """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved - to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking + """A `Candidate` object refers to a textual mention that may or may not be resolved + to a specific entity from a Knowledge Base. This will be used as input for the entity linking algorithm which will disambiguate the various candidates to the correct one. 
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
index c89efeb03..3d8da4b95 100644
--- a/spacy/kb/candidate.pyx
+++ b/spacy/kb/candidate.pyx
@@ -1,74 +1,96 @@
 # cython: infer_types=True, profile=True
 
-from typing import Iterable
-from .kb cimport KnowledgeBase
-from ..tokens import Span
+from .kb_in_memory cimport InMemoryLookupKB
+from ..errors import Errors
 
 
 cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
-    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
+    """A `Candidate` object refers to a textual mention that may or may not be resolved
+    to a specific entity from a Knowledge Base. This will be used as input for the entity linking
     algorithm which will disambiguate the various candidates to the correct one.
-    Each candidate (alias, entity) pair is assigned a certain prior probability.
+    Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
+    is assigned a certain prior probability.
 
     DOCS: https://spacy.io/api/kb/#candidate-init
     """
 
-    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
-        self.kb = kb
-        self.entity_hash = entity_hash
-        self.entity_freq = entity_freq
-        self.entity_vector = entity_vector
-        self.alias_hash = alias_hash
-        self.prior_prob = prior_prob
+    def __init__(self):
+        # Make sure abstract Candidate is not instantiated.
+        if self.__class__ == Candidate:
+            raise TypeError(
+                Errors.E1046.format(cls_name=self.__class__.__name__)
+            )
 
     @property
-    def entity(self) -> int:
-        """RETURNS (uint64): hash of the entity's KB ID/name"""
-        return self.entity_hash
+    def entity_id(self) -> int:
+        """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
+        otherwise the hash of the entity ID string)."""
+        raise NotImplementedError
 
     @property
-    def entity_(self) -> str:
-        """RETURNS (str): ID/name of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity_hash]
+    def entity_id_(self) -> str:
+        """RETURNS (str): String representation of entity ID."""
+        raise NotImplementedError
 
     @property
-    def alias(self) -> int:
-        """RETURNS (uint64): hash of the alias"""
-        return self.alias_hash
+    def entity_vector(self) -> vector[float]:
+        """RETURNS (vector[float]): Entity vector."""
+        raise NotImplementedError
+
+
+cdef class InMemoryCandidate(Candidate):
+    """Candidate for InMemoryLookupKB."""
+
+    def __init__(
+        self,
+        kb: InMemoryLookupKB,
+        entity_hash: int,
+        alias_hash: int,
+        entity_vector: vector[float],
+        prior_prob: float,
+        entity_freq: float
+    ):
+        """
+        kb (InMemoryLookupKB): InMemoryLookupKB instance.
+        entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
+        entity_freq (int): Entity frequency in KB corpus.
+        entity_vector (List[float]): Entity embedding.
+        alias_hash (int): Alias hash.
+        prior_prob (float): Prior probability of entity for this alias, i.e. the probability that, independent of
+            the context, this alias - which matches one of this entity's aliases - resolves to this entity.
+        """
+        super().__init__()
+
+        self._entity_hash = entity_hash
+        self._entity_vector = entity_vector
+        self._prior_prob = prior_prob
+        self._kb = kb
+        self._alias_hash = alias_hash
+        self._entity_freq = entity_freq
 
     @property
-    def alias_(self) -> str:
-        """RETURNS (str): ID of the original alias"""
-        return self.kb.vocab.strings[self.alias_hash]
+    def entity_id(self) -> int:
+        return self._entity_hash
 
     @property
-    def entity_freq(self) -> float:
-        return self.entity_freq
-
-    @property
-    def entity_vector(self) -> Iterable[float]:
-        return self.entity_vector
+    def entity_vector(self) -> vector[float]:
+        return self._entity_vector
 
     @property
     def prior_prob(self) -> float:
-        return self.prior_prob
+        """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
+        this entity."""
+        return self._prior_prob
 
+    @property
+    def alias(self) -> str:
+        """RETURNS (str): Alias."""
+        return self._kb.vocab.strings[self._alias_hash]
 
-def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
-    """
-    Return candidate entities for a given mention and fetching appropriate entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Span): Entity mention for which to identify candidates.
-    RETURNS (Iterable[Candidate]): Identified candidates.
-    """
-    return kb.get_candidates(mention)
+    @property
+    def entity_id_(self) -> str:
+        return self._kb.vocab.strings[self._entity_hash]
 
-
-def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
-    """
-    Return candidate entities for the given mentions and fetching appropriate entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Iterable[Span]): Entity mentions for which to identify candidates.
-    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
-    """
-    return kb.get_candidates_batch(mentions)
+    @property
+    def entity_freq(self) -> float:
+        """RETURNS (float): Entity frequency in KB corpus."""
+        return self._entity_freq
""" @@ -106,3 +108,10 @@ cdef class KnowledgeBase: raise NotImplementedError( Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) ) + + @property + def supports_prior_probs(self) -> bool: + """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions.""" + raise NotImplementedError( + Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__) + ) diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index edba523cf..c9ced8309 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -18,7 +18,7 @@ from .. import util from ..util import SimpleFrozenList, ensure_path from ..vocab cimport Vocab from .kb cimport KnowledgeBase -from .candidate import Candidate as Candidate +from .candidate import InMemoryCandidate cdef class InMemoryLookupKB(KnowledgeBase): @@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): self._alias_index = PreshMap(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1) + def is_empty(self): + return len(self) == 0 + def __len__(self): return self.get_size_entities() @@ -223,10 +226,10 @@ cdef class InMemoryLookupKB(KnowledgeBase): alias_entry.probs = probs self._aliases_table[alias_index] = alias_entry - def get_candidates(self, mention: Span) -> Iterable[Candidate]: - return self.get_alias_candidates(mention.text) # type: ignore + def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]: + return self._get_alias_candidates(mention.text) # type: ignore - def get_alias_candidates(self, str alias) -> Iterable[Candidate]: + def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]: """ Return candidate entities for an alias. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. @@ -238,14 +241,18 @@ cdef class InMemoryLookupKB(KnowledgeBase): alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] - return [Candidate(kb=self, - entity_hash=self._entries[entry_index].entity_hash, - entity_freq=self._entries[entry_index].freq, - entity_vector=self._vectors_table[self._entries[entry_index].vector_index], - alias_hash=alias_hash, - prior_prob=prior_prob) - for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) - if entry_index != 0] + return [ + InMemoryCandidate( + kb=self, + entity_hash=self._entries[entry_index].entity_hash, + alias_hash=alias_hash, + entity_vector=self._vectors_table[self._entries[entry_index].vector_index], + prior_prob=prior_prob, + entity_freq=self._entries[entry_index].freq + ) + for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) + if entry_index != 0 + ] def get_vector(self, str entity): cdef hash_t entity_hash = self.vocab.strings[entity] @@ -276,6 +283,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): return 0.0 + def supports_prior_probs(self) -> bool: + return True + def to_bytes(self, **kwargs): """Serialize the current state to a binary string. 
""" diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 6963e8b79..28e5085a8 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer - - -# Punctuation stolen from Danish -from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES class SwedishDefaults(BaseDefaults): diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py new file mode 100644 index 000000000..67f1bcdc4 --- /dev/null +++ b/spacy/lang/sv/punctuation.py @@ -0,0 +1,33 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import TOKENIZER_SUFFIXES + + +_quotes = CONCAT_QUOTES.replace("'", "") + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER), + ] +) + +_suffixes = [ + suffix + for suffix in TOKENIZER_SUFFIXES + if suffix not in ["'s", "'S", "’s", "’S", r"\'"] +] +_suffixes += [r"(?<=[^sSxXzZ])\'"] + + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/language.py b/spacy/language.py index c5750ea85..23280be7b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2065,7 +2065,7 @@ class Language: pipe = self.get_pipe(pipe_name) pipe_cfg = self._pipe_configs[pipe_name] if listeners: - util.logger.debug(f"Replacing listeners of component '{pipe_name}'") + util.logger.debug("Replacing listeners of component '%s'", pipe_name) if len(list(listeners)) != len(pipe_listeners): # The number of listeners defined in the component model doesn't # match the listeners to replace, so we won't be able to update diff --git a/spacy/lookups.py b/spacy/lookups.py index 0e6fb3b7c..35fbd54b6 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -30,7 +30,7 @@ def load_lookups_data_from_url(lang, tables, url): r = requests.get(table_url) if r.status_code != 200: raise ValueError( - Errors.E4006.format(status_code=r.status_code, url=table_url) + Errors.E4008.format(status_code=r.status_code, url=table_url) ) table_data = r.json() lookups.add_table(table, table_data) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 4c6004907..e2a1b8a3b 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -82,8 +82,12 @@ cdef class DependencyMatcher: "$-": self._imm_left_sib, "$++": self._right_sib, "$--": self._left_sib, + ">+": self._imm_right_child, + ">-": self._imm_left_child, ">++": self._right_child, ">--": self._left_child, + "<+": self._imm_right_parent, + "<-": self._imm_left_parent, "<++": self._right_parent, "<--": self._left_parent, } @@ -427,12 +431,34 @@ cdef class DependencyMatcher: def _left_sib(self, doc, node): return [doc[child.i] for child in doc[node].head.children if child.i < node] + def _imm_right_child(self, doc, node): + for child in doc[node].children: + if child.i 
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 6963e8b79..28e5085a8 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language, BaseDefaults
 from ...pipeline import Lemmatizer
-
-
-# Punctuation stolen from Danish
-from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 
 
 class SwedishDefaults(BaseDefaults):
diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py
new file mode 100644
index 000000000..67f1bcdc4
--- /dev/null
+++ b/spacy/lang/sv/punctuation.py
@@ -0,0 +1,33 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_quotes = CONCAT_QUOTES.replace("'", "")
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
+    ]
+)
+
+_suffixes = [
+    suffix
+    for suffix in TOKENIZER_SUFFIXES
+    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
+]
+_suffixes += [r"(?<=[^sSxXzZ])\'"]
+
+
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
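The new Swedish punctuation rules only split on a colon between uppercase letters, so abbreviations like "EU:s" stay intact. A quick sketch mirroring the regression test added later in this diff:

    from spacy.lang.sv import Swedish

    nlp = Swedish()
    doc = nlp("EU:s nya regler")
    # "EU:s" is kept as a single token (issue #12311).
    print([t.text for t in doc])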
diff --git a/spacy/language.py b/spacy/language.py
index c5750ea85..23280be7b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -2065,7 +2065,7 @@ class Language:
             pipe = self.get_pipe(pipe_name)
             pipe_cfg = self._pipe_configs[pipe_name]
             if listeners:
-                util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
+                util.logger.debug("Replacing listeners of component '%s'", pipe_name)
                 if len(list(listeners)) != len(pipe_listeners):
                     # The number of listeners defined in the component model doesn't
                     # match the listeners to replace, so we won't be able to update
diff --git a/spacy/lookups.py b/spacy/lookups.py
index 0e6fb3b7c..35fbd54b6 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -30,7 +30,7 @@ def load_lookups_data_from_url(lang, tables, url):
         r = requests.get(table_url)
         if r.status_code != 200:
             raise ValueError(
-                Errors.E4006.format(status_code=r.status_code, url=table_url)
+                Errors.E4008.format(status_code=r.status_code, url=table_url)
             )
         table_data = r.json()
         lookups.add_table(table, table_data)
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 4c6004907..e2a1b8a3b 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -82,8 +82,12 @@ cdef class DependencyMatcher:
             "$-": self._imm_left_sib,
             "$++": self._right_sib,
             "$--": self._left_sib,
+            ">+": self._imm_right_child,
+            ">-": self._imm_left_child,
             ">++": self._right_child,
             ">--": self._left_child,
+            "<+": self._imm_right_parent,
+            "<-": self._imm_left_parent,
             "<++": self._right_parent,
             "<--": self._left_parent,
         }
@@ -427,12 +431,34 @@ cdef class DependencyMatcher:
     def _left_sib(self, doc, node):
         return [doc[child.i] for child in doc[node].head.children if child.i < node]
 
+    def _imm_right_child(self, doc, node):
+        for child in doc[node].children:
+            if child.i == node + 1:
+                return [doc[child.i]]
+        return []
+
+    def _imm_left_child(self, doc, node):
+        for child in doc[node].children:
+            if child.i == node - 1:
+                return [doc[child.i]]
+        return []
+
     def _right_child(self, doc, node):
         return [doc[child.i] for child in doc[node].children if child.i > node]
 
     def _left_child(self, doc, node):
         return [doc[child.i] for child in doc[node].children if child.i < node]
 
+    def _imm_right_parent(self, doc, node):
+        if doc[node].head.i == node + 1:
+            return [doc[node].head]
+        return []
+
+    def _imm_left_parent(self, doc, node):
+        if doc[node].head.i == node - 1:
+            return [doc[node].head]
+        return []
+
     def _right_parent(self, doc, node):
         if doc[node].head.i > node:
             return [doc[node].head]
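A sketch of one of the new operators (">+", immediate right child) in a DependencyMatcher pattern; it assumes a pipeline with a dependency parser such as en_core_web_sm, and the exact matches depend on that parse:

    import spacy
    from spacy.matcher import DependencyMatcher

    nlp = spacy.load("en_core_web_sm")
    matcher = DependencyMatcher(nlp.vocab)
    pattern = [
        {"RIGHT_ID": "anchor", "RIGHT_ATTRS": {"ORTH": "jumped"}},
        # ">+" requires a syntactic child that is also the next token.
        {"LEFT_ID": "anchor", "REL_OP": ">+", "RIGHT_ID": "child", "RIGHT_ATTRS": {}},
    ]
    matcher.add("IMMEDIATE_RIGHT_CHILD", [pattern])
    doc = nlp("The quick brown fox jumped over the lazy dog.")
    for match_id, token_ids in matcher(doc):
        print([doc[i].text for i in token_ids])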
+ """ + return kb.get_candidates_batch(mentions) diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx index 79be13b00..9b2114900 100644 --- a/spacy/ml/tb_framework.pyx +++ b/spacy/ml/tb_framework.pyx @@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states, cdef np.ndarray step_actions scores = [] - while sizes.states >= 1: + while sizes.states >= 1 and (actions is None or len(actions) > 0): step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f") step_actions = actions[0] if actions is not None else None + assert step_actions is None or step_actions.size == sizes.states, \ + f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})" with nogil: _predict_states(cblas, &activations, step_scores.data, states, &weights, sizes) if actions is None: diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 63d5cccc2..ecd156db5 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,5 +1,5 @@ -from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any -from typing import cast +import warnings +from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast from numpy import dtype from thinc.types import Floats1d, Floats2d, Ints1d, Ragged from pathlib import Path @@ -10,14 +10,15 @@ from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import set_dropout_rate from ..kb import KnowledgeBase, Candidate -from ..ml import empty_kb from ..tokens import Doc, Span +from ..ml import empty_kb +from ..tokens import Doc, Span, SpanGroup from .pipe import deserialize_config from .trainable_pipe import TrainablePipe from ..language import Language from ..vocab import Vocab from ..training import Example, validate_examples, validate_get_examples -from ..errors import Errors +from ..errors import Errors, Warnings from ..util import SimpleFrozenList, registry from .. import util from ..scorer import Scorer @@ -58,6 +59,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, "overwrite": False, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, "candidates_batch_size": 1, @@ -82,8 +84,9 @@ def make_entity_linker( entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_batch: Callable[ - [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] + [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, scorer: Optional[Callable], use_gold_ents: bool, @@ -104,8 +107,9 @@ def make_entity_linker( get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. get_candidates_batch ( - Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] + Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. 
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 299b6bb52..b5122b164 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -6,9 +6,9 @@ from thinc.api import Model, Maxout, Linear, tuplify, Ragged
 
 from ...util import registry
 from ...kb import KnowledgeBase, InMemoryLookupKB
-from ...kb import Candidate, get_candidates, get_candidates_batch
+from ...kb import Candidate
 from ...vocab import Vocab
-from ...tokens import Span, Doc
+from ...tokens import Doc, Span, SpanGroup
 from ..extract_spans import extract_spans
 from ...errors import Errors
 
@@ -89,6 +89,14 @@ def load_kb(
     return kb_from_file
 
 
+@registry.misc("spacy.EmptyKB.v2")
+def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
+    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
+
+    return empty_kb_factory
+
+
 @registry.misc("spacy.EmptyKB.v1")
 def empty_kb(
     entity_vector_length: int,
@@ -106,6 +114,28 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
 
 @registry.misc("spacy.CandidateBatchGenerator.v1")
 def create_candidates_batch() -> Callable[
-    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+    [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
 ]:
     return get_candidates_batch
+
+
+def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
+    """
+    Return candidate entities for a given mention and fetching appropriate entries from the index.
+    kb (KnowledgeBase): Knowledge base to query.
+    mention (Span): Entity mention for which to identify candidates.
+    RETURNS (Iterable[Candidate]): Identified candidates.
+    """
+    return kb.get_candidates(mention)
+
+
+def get_candidates_batch(
+    kb: KnowledgeBase, mentions: SpanGroup
+) -> Iterable[Iterable[Candidate]]:
+    """
+    Return candidate entities for the given mentions and fetching appropriate entries from the index.
+    kb (KnowledgeBase): Knowledge base to query.
+    mentions (SpanGroup): Entity mentions for which to identify candidates.
+    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
+    """
+    return kb.get_candidates_batch(mentions)
diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx
index 79be13b00..9b2114900 100644
--- a/spacy/ml/tb_framework.pyx
+++ b/spacy/ml/tb_framework.pyx
@@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
     cdef np.ndarray step_actions
 
     scores = []
-    while sizes.states >= 1:
+    while sizes.states >= 1 and (actions is None or len(actions) > 0):
         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
         step_actions = actions[0] if actions is not None else None
+        assert step_actions is None or step_actions.size == sizes.states, \
+            f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
         with nogil:
             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
         if actions is None:
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 63d5cccc2..ecd156db5 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,5 +1,5 @@
-from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
-from typing import cast
+import warnings
+from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast
 from numpy import dtype
 from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
 from pathlib import Path
@@ -10,14 +10,15 @@ from thinc.api import CosineDistance, Model, Optimizer, Config
 from thinc.api import set_dropout_rate
 
 from ..kb import KnowledgeBase, Candidate
-from ..ml import empty_kb
 from ..tokens import Doc, Span
+from ..ml import empty_kb
+from ..tokens import Doc, Span, SpanGroup
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
 from ..vocab import Vocab
 from ..training import Example, validate_examples, validate_get_examples
-from ..errors import Errors
+from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList, registry
 from .. import util
 from ..scorer import Scorer
@@ -58,6 +59,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
         "overwrite": False,
+        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
         "use_gold_ents": True,
         "candidates_batch_size": 1,
@@ -82,8 +84,9 @@ def make_entity_linker(
     entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     get_candidates_batch: Callable[
-        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+        [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
     ],
+    generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
     scorer: Optional[Callable],
     use_gold_ents: bool,
@@ -104,8 +107,9 @@ def make_entity_linker(
     get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
         produces a list of candidates, given a certain knowledge base and a textual mention.
     get_candidates_batch (
-        Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
+        Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
     ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
+    generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
     use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
         component must provide entity annotations.
@@ -114,28 +118,9 @@ def make_entity_linker(
         prediction is discarded. If None, predictions are not filtered by any threshold.
     save_activations (bool): save model activations in Doc when annotating.
     """
-    if not model.attrs.get("include_span_maker", False):
-        try:
-            from spacy_legacy.components.entity_linker import EntityLinker_v1
-        except:
-            raise ImportError(
-                "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
-            )
-        # The only difference in arguments here is that use_gold_ents and threshold aren't available.
-        return EntityLinker_v1(
-            nlp.vocab,
-            model,
-            name,
-            labels_discard=labels_discard,
-            n_sents=n_sents,
-            incl_prior=incl_prior,
-            incl_context=incl_context,
-            entity_vector_length=entity_vector_length,
-            get_candidates=get_candidates,
-            overwrite=overwrite,
-            scorer=scorer,
-        )
+    if not model.attrs.get("include_span_maker", False):
+        raise ValueError(Errors.E4005)
+
     return EntityLinker(
         nlp.vocab,
         model,
@@ -147,6 +132,7 @@
         entity_vector_length=entity_vector_length,
         get_candidates=get_candidates,
         get_candidates_batch=get_candidates_batch,
+        generate_empty_kb=generate_empty_kb,
        overwrite=overwrite,
        scorer=scorer,
        use_gold_ents=use_gold_ents,
@@ -186,8 +172,9 @@ class EntityLinker(TrainablePipe):
         entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
         get_candidates_batch: Callable[
-            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+            [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
         ],
+        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
         overwrite: bool = False,
         scorer: Optional[Callable] = entity_linker_score,
         use_gold_ents: bool,
@@ -209,9 +196,10 @@ class EntityLinker(TrainablePipe):
         get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.
         get_candidates_batch (
-            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
+            Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
+        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
         overwrite (bool): Whether to overwrite existing non-empty annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
         use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
@@ -219,6 +207,7 @@ class EntityLinker(TrainablePipe):
         candidates_batch_size (int): Size of batches for entity candidate generation.
         threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
             threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/entitylinker#init
         """
@@ -235,6 +224,7 @@ class EntityLinker(TrainablePipe):
         self.model = model
         self.name = name
         self.labels_discard = list(labels_discard)
+        # how many neighbour sentences to take into account
         self.n_sents = n_sents
         self.incl_prior = incl_prior
         self.incl_context = incl_context
@@ -242,9 +232,7 @@ class EntityLinker(TrainablePipe):
         self.get_candidates_batch = get_candidates_batch
         self.cfg: Dict[str, Any] = {"overwrite": overwrite}
         self.distance = CosineDistance(normalize=False)
-        # how many neighbour sentences to take into account
-        # create an empty KB by default
-        self.kb = empty_kb(entity_vector_length)(self.vocab)
+        self.kb = generate_empty_kb(self.vocab, entity_vector_length)
         self.scorer = scorer
         self.use_gold_ents = use_gold_ents
         self.candidates_batch_size = candidates_batch_size
@@ -253,6 +241,8 @@ class EntityLinker(TrainablePipe):
 
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)
+        if self.incl_prior and not self.kb.supports_prior_probs:
+            warnings.warn(Warnings.W401)
 
     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will
@@ -266,7 +256,7 @@ class EntityLinker(TrainablePipe):
         # Raise an error if the knowledge base is not initialized.
         if self.kb is None:
             raise ValueError(Errors.E1018.format(name=self.name))
-        if len(self.kb) == 0:
+        if hasattr(self.kb, "is_empty") and self.kb.is_empty():
            raise ValueError(Errors.E139.format(name=self.name))
 
     def initialize(
@@ -485,7 +475,8 @@ class EntityLinker(TrainablePipe):
 
                 batch_candidates = list(
                     self.get_candidates_batch(
-                        self.kb, [ent_batch[idx] for idx in valid_ent_idx]
+                        self.kb,
+                        SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
                     )
                     if self.candidates_batch_size > 1
                     else [
@@ -535,18 +526,19 @@ class EntityLinker(TrainablePipe):
                             )
                         elif len(candidates) == 1 and self.threshold is None:
                             # shortcut for efficiency reasons: take the 1 candidate
-                            final_kb_ids.append(candidates[0].entity_)
+                            final_kb_ids.append(candidates[0].entity_id_)
                             self._add_activations(
                                 doc_scores=doc_scores,
                                 doc_ents=doc_ents,
                                 scores=[1.0],
-                                ents=[candidates[0].entity_],
+                                ents=[candidates[0].entity_id],
                             )
                         else:
                             random.shuffle(candidates)
                             # set all prior probabilities to 0 if incl_prior=False
-                            prior_probs = xp.asarray([c.prior_prob for c in candidates])
-                            if not self.incl_prior:
+                            if self.incl_prior and self.kb.supports_prior_probs:
+                                prior_probs = xp.asarray([c.prior_prob for c in candidates])  # type: ignore
+                            else:
                                 prior_probs = xp.asarray([0.0 for _ in candidates])
                             scores = prior_probs
                             # add in similarity from the context
@@ -570,7 +562,7 @@ class EntityLinker(TrainablePipe):
                                     raise ValueError(Errors.E161)
                                 scores = prior_probs + sims - (prior_probs * sims)
                             final_kb_ids.append(
-                                candidates[scores.argmax().item()].entity_
+                                candidates[scores.argmax().item()].entity_id_
                                 if self.threshold is None
                                 or scores.max() >= self.threshold
                                 else EntityLinker.NIL
@@ -579,7 +571,7 @@ class EntityLinker(TrainablePipe):
                                 doc_scores=doc_scores,
                                 doc_ents=doc_ents,
                                 scores=scores,
-                                ents=[c.entity for c in candidates],
+                                ents=[c.entity_id for c in candidates],
                             )
                 self._add_doc_activations(
                     docs_scores=docs_scores,
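A sketch of how the new `generate_empty_kb` setting can be passed when adding the component (the values shown are simply the defaults introduced above):

    import spacy

    nlp = spacy.blank("en")
    entity_linker = nlp.add_pipe(
        "entity_linker",
        config={
            # "spacy.EmptyKB.v2" builds the placeholder KB from the shared
            # vocab and the configured entity vector length.
            "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
            "entity_vector_length": 64,
        },
    )
    print(len(entity_linker.kb))  # 0 until a real KB is provided via set_kb()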
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 03495ba74..a7fe0bd40 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -167,7 +167,7 @@ class Lemmatizer(Pipe):
             missing_tables = set(required_tables) - set(lookups.tables)
             if len(missing_tables) > 0:
                 raise ValueError(
-                    Errors.E4005.format(
+                    Errors.E4007.format(
                         missing_tables=list(missing_tables),
                         pipe_name=self.name,
                         required_tables=srsly.json_dumps(required_tables),
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index c742aaeaa..d9639f8d5 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,5 +1,6 @@
-from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any
+from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
 from thinc.api import Model, set_dropout_rate, Optimizer, Config
+from thinc.types import Floats2d
 from itertools import islice
 
 from .trainable_pipe import TrainablePipe
@@ -157,39 +158,9 @@ class Tok2Vec(TrainablePipe):
 
         DOCS: https://spacy.io/api/tok2vec#update
         """
-        if losses is None:
-            losses = {}
         validate_examples(examples, "Tok2Vec.update")
         docs = [eg.predicted for eg in examples]
-        set_dropout_rate(self.model, drop)
-        tokvecs, bp_tokvecs = self.model.begin_update(docs)
-        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
-        losses.setdefault(self.name, 0.0)
-
-        def accumulate_gradient(one_d_tokvecs):
-            """Accumulate tok2vec loss and gradient. This is passed as a callback
-            to all but the last listener. Only the last one does the backprop.
-            """
-            nonlocal d_tokvecs
-            for i in range(len(one_d_tokvecs)):
-                d_tokvecs[i] += one_d_tokvecs[i]
-                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
-            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
-
-        def backprop(one_d_tokvecs):
-            """Callback to actually do the backprop. Passed to last listener."""
-            accumulate_gradient(one_d_tokvecs)
-            d_docs = bp_tokvecs(d_tokvecs)
-            if sgd is not None:
-                self.finish_update(sgd)
-            return d_docs
-
-        batch_id = Tok2VecListener.get_batch_id(docs)
-        for listener in self.listeners[:-1]:
-            listener.receive(batch_id, tokvecs, accumulate_gradient)
-        if self.listeners:
-            self.listeners[-1].receive(batch_id, tokvecs, backprop)
-        return losses
+        return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses)
 
     def get_loss(self, examples, scores) -> None:
         pass
@@ -219,6 +190,96 @@ class Tok2Vec(TrainablePipe):
     def add_label(self, label):
         raise NotImplementedError
 
+    def distill(
+        self,
+        teacher_pipe: Optional["TrainablePipe"],
+        examples: Iterable["Example"],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+    ) -> Dict[str, float]:
+        """Performs an update of the student pipe's model using the
+        student's distillation examples and sets the annotations
+        of the teacher's distillation examples using the teacher pipe.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use
+            for prediction.
+        examples (Iterable[Example]): Distillation examples. The reference (teacher)
+            and predicted (student) docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+
+        DOCS: https://spacy.io/api/tok2vec#distill
+        """
+        # By default we require a teacher pipe, but there are downstream
+        # implementations that don't require a pipe.
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        teacher_docs = [eg.reference for eg in examples]
+        student_docs = [eg.predicted for eg in examples]
+        teacher_preds = teacher_pipe.predict(teacher_docs)
+        teacher_pipe.set_annotations(teacher_docs, teacher_preds)
+        return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses)
+
+    def _update_with_docs(
+        self,
+        docs: Iterable[Doc],
+        *,
+        drop: float = 0.0,
+        sgd: Optional[Optimizer] = None,
+        losses: Optional[Dict[str, float]] = None,
+    ):
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+        set_dropout_rate(self.model, drop)
+
+        tokvecs, accumulate_gradient, backprop = self._create_backprops(
+            docs, losses, sgd=sgd
+        )
+        batch_id = Tok2VecListener.get_batch_id(docs)
+        for listener in self.listeners[:-1]:
+            listener.receive(batch_id, tokvecs, accumulate_gradient)
+        if self.listeners:
+            self.listeners[-1].receive(batch_id, tokvecs, backprop)
+        return losses
+
+    def _create_backprops(
+        self,
+        docs: Iterable[Doc],
+        losses: Dict[str, float],
+        *,
+        sgd: Optional[Optimizer] = None,
+    ) -> Tuple[Floats2d, Callable, Callable]:
+        tokvecs, bp_tokvecs = self.model.begin_update(docs)
+        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
+
+        def accumulate_gradient(one_d_tokvecs):
+            """Accumulate tok2vec loss and gradient. This is passed as a callback
+            to all but the last listener. Only the last one does the backprop.
+            """
+            nonlocal d_tokvecs
+            for i in range(len(one_d_tokvecs)):
+                d_tokvecs[i] += one_d_tokvecs[i]
+                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
+            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
+
+        def backprop(one_d_tokvecs):
+            """Callback to actually do the backprop. Passed to last listener."""
+            accumulate_gradient(one_d_tokvecs)
+            d_docs = bp_tokvecs(d_tokvecs)
+            if sgd is not None:
+                self.finish_update(sgd)
+            return d_docs
+
+        return tokvecs, accumulate_gradient, backprop
+
 
 class Tok2VecListener(Model):
     """A layer that gets fed its answers from an upstream connection,
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9e50dd7b2..2d2a36252 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -36,6 +36,11 @@ from ..errors import Errors, Warnings
 from .. import util
 
 
+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
+
 NUMPY_OPS = NumpyOps()
 
 
@@ -253,8 +258,8 @@ class Parser(TrainablePipe):
             # batch uniform length. Since we do not have a gold standard
             # sequence, we use the teacher's predictions as the gold
             # standard.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states = self._init_batch(teacher_pipe, student_docs, max_moves)
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
         else:
             states = self.moves.init_batch(student_docs)
 
@@ -265,12 +270,12 @@ class Parser(TrainablePipe):
         # gradients of the student's transition distributions relative to the
         # teacher's distributions.
 
-        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
-            max_moves=max_moves)
+        student_inputs = TransitionModelInputs(docs=student_docs,
+            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
+        actions = _states_diff_to_actions(states, student_states)
         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
-            moves=self.moves, actions=actions)
+            states=states, moves=teacher_pipe.moves, actions=actions)
         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
 
         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@@ -522,7 +527,7 @@ class Parser(TrainablePipe):
         set_dropout_rate(self.model, 0.0)
         student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
-        actions = states2actions(student_states)
+        actions = _states_to_actions(student_states)
         teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
         _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
 
@@ -642,7 +647,7 @@ class Parser(TrainablePipe):
                 raise ValueError(Errors.E149) from None
         return self
 
-    def _init_batch(self, teacher_step_model, docs, max_length):
+    def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
         """Make a square batch of length equal to the shortest transition
         sequence or a cap. A long doc will get multiple states. Let's say we
         have a doc of length 2*N,
@@ -651,10 +656,12 @@ class Parser(TrainablePipe):
        _init_gold_batch, this version uses a teacher model to generate the
        cut sequences."""
         cdef:
-            StateClass start_state
             StateClass state
-            Transition action
-        all_states = self.moves.init_batch(docs)
+            TransitionSystem moves = teacher_pipe.moves
+
+        # Start with the same heuristic as in supervised training: exclude
+        # docs that are within the maximum length.
+        all_states = moves.init_batch(docs)
         states = []
         to_cut = []
         for state, doc in zip(all_states, docs):
@@ -663,18 +670,28 @@ class Parser(TrainablePipe):
                 states.append(state)
             else:
                 to_cut.append(state)
+
+        if not to_cut:
+            return states
+
+        # Parse the states that are too long with the teacher's parsing model.
+        teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
+            states=[state.copy() for state in to_cut])
+        (teacher_states, _) = teacher_pipe.model.predict(teacher_inputs)
+
+        # Step through the teacher's actions and store every state after
+        # each multiple of max_length.
+        teacher_actions = _states_to_actions(teacher_states)
         while to_cut:
             states.extend(state.copy() for state in to_cut)
-            # Move states forward max_length actions.
-            length = 0
-            while to_cut and length < max_length:
-                teacher_scores = teacher_step_model.predict(to_cut)
-                self.transition_states(to_cut, teacher_scores)
-                # States that are completed do not need further cutting.
-                to_cut = [state for state in to_cut if not state.is_final()]
-                length += 1
-        return states
+            for step_actions in teacher_actions[:max_length]:
+                to_cut = moves.apply_actions(to_cut, step_actions)
+            teacher_actions = teacher_actions[max_length:]
+            if len(teacher_actions) < max_length:
+                break
+
+        return states
 
     def _init_gold_batch(self, examples, max_length):
         """Make a square batch, of length equal to the shortest transition
@@ -736,7 +753,7 @@ def _change_attrs(model, **kwargs):
         model.attrs[key] = value
 
 
-def states2actions(states: List[StateClass]) -> List[Ints1d]:
+def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
     cdef int step
     cdef StateClass state
     cdef StateC* c_state
@@ -757,3 +774,45 @@ def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
         actions.append(numpy.array(step_actions, dtype="i"))
 
     return actions
+
+
+def _states_diff_to_actions(
+    before_states: List[StateClass],
+    after_states: List[StateClass]
+) -> List[Ints1d]:
+    """
+    Return for two sets of states the actions to go from the first set of
+    states to the second set of states. The histories of the first set of
+    states must be a prefix of the second set of states.
+    """
+    cdef StateClass before_state, after_state
+    cdef StateC* c_state_before
+    cdef StateC* c_state_after
+
+    assert len(before_states) == len(after_states)
+
+    # Check invariant: before states histories must be prefixes of after states.
+    for before_state, after_state in zip(before_states, after_states):
+        c_state_before = before_state.c
+        c_state_after = after_state.c
+
+        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
+                     c_state_after.history.begin())
+
+    actions = []
+    while True:
+        step = len(actions)
+
+        step_actions = []
+        for before_state, after_state in zip(before_states, after_states):
+            c_state_before = before_state.c
+            c_state_after = after_state.c
+            if step < c_state_after.history.size() - c_state_before.history.size():
+                step_actions.append(c_state_after.history[c_state_before.history.size() + step])
+
+        # We are done if we have exhausted all histories.
+        if len(step_actions) == 0:
+            break
+
+        actions.append(numpy.array(step_actions, dtype="i"))
+
+    return actions
diff --git a/spacy/strings.pyi b/spacy/strings.pyi
index d9509ff57..38dee7034 100644
--- a/spacy/strings.pyi
+++ b/spacy/strings.pyi
@@ -2,7 +2,7 @@ from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
 from pathlib import Path
 
 class StringStore:
-    def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
+    def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
     @overload
     def __getitem__(self, string_or_hash: str) -> int: ...
     @overload
diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py
index bbb0ff415..0aa495992 100644
--- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py
@@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
 def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
     tokens = sv_tokenizer(text)
     assert len(tokens) == 3
+
+
+@pytest.mark.issue(12311)
+@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
+def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
+    tokens = sv_tokenizer(text)
+    assert len(tokens) == 1
diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py
index b4e19d69d..200384320 100644
--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
         ("the", "brown", "$--", 0),
         ("brown", "the", "$--", 1),
         ("brown", "brown", "$--", 0),
+        ("over", "jumped", "<+", 0),
+        ("quick", "fox", "<+", 0),
+        ("the", "quick", "<+", 0),
+        ("brown", "fox", "<+", 1),
         ("quick", "fox", "<++", 1),
         ("quick", "over", "<++", 0),
         ("over", "jumped", "<++", 0),
         ("the", "fox", "<++", 2),
+        ("brown", "fox", "<-", 0),
+        ("fox", "over", "<-", 0),
+        ("the", "over", "<-", 0),
+        ("over", "jumped", "<-", 1),
         ("brown", "fox", "<--", 0),
         ("fox", "jumped", "<--", 0),
         ("fox", "over", "<--", 1),
+        ("fox", "brown", ">+", 0),
+        ("over", "fox", ">+", 0),
+        ("over", "the", ">+", 0),
+        ("jumped", "over", ">+", 1),
         ("jumped", "over", ">++", 1),
         ("fox", "lazy", ">++", 0),
         ("over", "the", ">++", 0),
+        ("jumped", "over", ">-", 0),
+        ("fox", "quick", ">-", 0),
+        ("brown", "quick", ">-", 0),
+        ("fox", "brown", ">-", 1),
         ("brown", "fox", ">--", 0),
         ("fox", "brown", ">--", 1),
         ("jumped", "fox", ">--", 1),
diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py
new file mode 100644
index 000000000..8c1cf7a93
--- /dev/null
+++ b/spacy/tests/parser/test_model.py
@@ -0,0 +1,61 @@
+import numpy
+import pytest
+
+from spacy.lang.en import English
+from spacy.ml.tb_framework import TransitionModelInputs
+from spacy.training import Example
+
+TRAIN_DATA = [
+    (
+        "They trade mortgage-backed securities.",
+        {
+            "heads": [1, 1, 4, 4, 5, 1, 1],
+            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
+        },
+    ),
+    (
+        "I like London and Berlin.",
+        {
+            "heads": [1, 1, 1, 2, 2, 1],
+            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
+        },
+    ),
+]
+
+
+@pytest.fixture
+def nlp_parser():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for dep in annotations["deps"]:
+            parser.add_label(dep)
+    nlp.initialize()
+
+    return nlp, parser
+
+
+def test_incorrect_number_of_actions(nlp_parser):
+    nlp, parser = nlp_parser
+    doc = nlp.make_doc("test")
+
+    # Too many actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
+            )
+        )
+
+    # Too few actions for the number of docs
+    with pytest.raises(AssertionError):
+        parser.model.predict(
+            TransitionModelInputs(
+                docs=[doc, doc],
+                moves=parser.moves,
+                actions=[numpy.array([0], dtype="i")],
+            )
+        )
b/spacy/tests/parser/test_ner.py index d6cd11e55..62b8f9704 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -623,7 +623,9 @@ def test_is_distillable(): assert ner.is_distillable -def test_distill(): +@pytest.mark.slow +@pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) +def test_distill(max_moves): teacher = English() teacher_ner = teacher.add_pipe("ner") train_examples = [] @@ -641,6 +643,7 @@ def test_distill(): student = English() student_ner = student.add_pipe("ner") + student_ner.cfg["update_with_oracle_cut_size"] = max_moves student_ner.initialize( get_examples=lambda: train_examples, labels=teacher_ner.label_data ) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 57b6e188b..2f2fa397e 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -463,7 +463,9 @@ def test_is_distillable(): assert parser.is_distillable -def test_distill(): +@pytest.mark.slow +@pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) +def test_distill(max_moves): teacher = English() teacher_parser = teacher.add_pipe("parser") train_examples = [] @@ -481,6 +483,7 @@ def test_distill(): student = English() student_parser = student.add_pipe("parser") + student_parser.cfg["update_with_oracle_cut_size"] = max_moves student_parser.initialize( get_examples=lambda: train_examples, labels=teacher_parser.label_data ) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 506530591..773a5b8f3 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -7,10 +7,10 @@ from thinc.types import Ragged from spacy import registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle -from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase +from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase from spacy.lang.en import English from spacy.ml import load_kb -from spacy.ml.models.entity_linker import build_span_maker +from spacy.ml.models.entity_linker import build_span_maker, get_candidates from spacy.pipeline import EntityLinker, TrainablePipe from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer @@ -353,6 +353,9 @@ def test_kb_default(nlp): """Test that the default (empty) KB is loaded upon construction""" entity_linker = nlp.add_pipe("entity_linker", config={}) assert len(entity_linker.kb) == 0 + with pytest.raises(ValueError, match="E139"): + # this raises an error because the KB is empty + entity_linker.validate_kb() assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_aliases() == 0 # 64 is the default value from pipeline.entity_linker @@ -462,16 +465,17 @@ def test_candidate_generation(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates + adam_ent_cands = get_candidates(mykb, adam_ent) assert len(get_candidates(mykb, douglas_ent)) == 2 - assert len(get_candidates(mykb, adam_ent)) == 1 + assert len(adam_ent_cands) == 1 assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive assert len(get_candidates(mykb, shrubbery_ent)) == 0 # test the content of the candidates - assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" - assert get_candidates(mykb, adam_ent)[0].alias_ == "adam" - assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) - assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) + 
assert adam_ent_cands[0].entity_id_ == "Q2" + assert adam_ent_cands[0].alias == "adam" + assert_almost_equal(adam_ent_cands[0].entity_freq, 12) + assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9) def test_el_pipe_configuration(nlp): @@ -499,7 +503,7 @@ def test_el_pipe_configuration(nlp): assert doc[2].ent_kb_id_ == "Q2" def get_lowercased_candidates(kb, span): - return kb.get_alias_candidates(span.text.lower()) + return kb._get_alias_candidates(span.text.lower()) def get_lowercased_candidates_batch(kb, spans): return [get_lowercased_candidates(kb, span) for span in spans] @@ -558,24 +562,22 @@ def test_vocab_serialization(nlp): mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]) adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) - candidates = mykb.get_alias_candidates("adam") + candidates = mykb._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity == q2_hash - assert candidates[0].entity_ == "Q2" - assert candidates[0].alias == adam_hash - assert candidates[0].alias_ == "adam" + assert candidates[0].entity_id == q2_hash + assert candidates[0].entity_id_ == "Q2" + assert candidates[0].alias == "adam" with make_tempdir() as d: mykb.to_disk(d / "kb") kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1) kb_new_vocab.from_disk(d / "kb") - candidates = kb_new_vocab.get_alias_candidates("adam") + candidates = kb_new_vocab._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity == q2_hash - assert candidates[0].entity_ == "Q2" - assert candidates[0].alias == adam_hash - assert candidates[0].alias_ == "adam" + assert candidates[0].entity_id == q2_hash + assert candidates[0].entity_id_ == "Q2" + assert candidates[0].alias == "adam" assert kb_new_vocab.get_vector("Q2") == [2] assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) @@ -595,20 +597,20 @@ def test_append_alias(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates - assert len(mykb.get_alias_candidates("douglas")) == 2 + assert len(mykb._get_alias_candidates("douglas")) == 2 # append an alias mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) # test the size of the relevant candidates has been incremented - assert len(mykb.get_alias_candidates("douglas")) == 3 + assert len(mykb._get_alias_candidates("douglas")) == 3 # append the same alias-entity pair again should not work (will throw a warning) with pytest.warns(UserWarning): mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3) # test the size of the relevant candidates remained unchanged - assert len(mykb.get_alias_candidates("douglas")) == 3 + assert len(mykb._get_alias_candidates("douglas")) == 3 @pytest.mark.filterwarnings("ignore:\\[W036") @@ -905,11 +907,11 @@ def test_kb_to_bytes(): assert kb_2.contains_alias("Russ Cochran") assert kb_1.get_size_aliases() == kb_2.get_size_aliases() assert kb_1.get_alias_strings() == kb_2.get_alias_strings() - assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( - kb_2.get_alias_candidates("Russ Cochran") + assert len(kb_1._get_alias_candidates("Russ Cochran")) == len( + kb_2._get_alias_candidates("Russ Cochran") ) - assert len(kb_1.get_alias_candidates("Randomness")) == len( - kb_2.get_alias_candidates("Randomness") + assert len(kb_1._get_alias_candidates("Randomness")) == len( + kb_2._get_alias_candidates("Randomness") ) @@ -990,14 +992,11 @@ def test_scorer_links(): @pytest.mark.parametrize( 
"name,config", [ - ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ], ) # fmt: on def test_legacy_architectures(name, config): - from spacy_legacy.components.entity_linker import EntityLinker_v1 - # Ensure that the legacy architectures still work vector_length = 3 nlp = English() @@ -1019,10 +1018,7 @@ def test_legacy_architectures(name, config): return mykb entity_linker = nlp.add_pipe(name, config={"model": config}) - if config["@architectures"] == "spacy.EntityLinker.v1": - assert isinstance(entity_linker, EntityLinker_v1) - else: - assert isinstance(entity_linker, EntityLinker) + assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) optimizer = nlp.initialize(get_examples=lambda: train_examples) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 9b9786f04..39611a742 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -9,6 +9,7 @@ from spacy.lang.en import English from spacy.lang.en.syntax_iterators import noun_chunks from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.strings import StringStore from spacy.tokens import Doc from spacy.training import Example from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir @@ -131,7 +132,7 @@ def test_issue5458(): # Test that the noun chuncker does not generate overlapping spans # fmt: off words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] - vocab = Vocab(strings=words) + vocab = Vocab(strings=StringStore(words)) deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index ee62b1ab4..6929b76fa 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -540,3 +540,86 @@ def test_tok2vec_listeners_textcat(): assert cats1["imperative"] < 0.9 assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] + + +cfg_string_distillation = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","tagger"] + + [components] + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v2" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v2" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +def test_tok2vec_distillation_teacher_annotations(): + orig_config = Config().from_str(cfg_string_distillation) + teacher_nlp = util.load_model_from_config( + orig_config, auto_fill=True, validate=True + ) + student_nlp = util.load_model_from_config( + 
orig_config, auto_fill=True, validate=True + ) + + train_examples_teacher = [] + train_examples_student = [] + for t in TRAIN_DATA: + train_examples_teacher.append( + Example.from_dict(teacher_nlp.make_doc(t[0]), t[1]) + ) + train_examples_student.append( + Example.from_dict(student_nlp.make_doc(t[0]), t[1]) + ) + + optimizer = teacher_nlp.initialize(lambda: train_examples_teacher) + student_nlp.initialize(lambda: train_examples_student) + + # Since Language.distill creates a copy of the examples to use as + # its internal teacher/student docs, we'll need to monkey-patch the + # tok2vec pipe's distill method. + student_tok2vec = student_nlp.get_pipe("tok2vec") + student_tok2vec._old_distill = student_tok2vec.distill + + def tok2vec_distill_wrapper( + self, + teacher_pipe, + examples, + **kwargs, + ): + assert all(not eg.reference.tensor.any() for eg in examples) + out = self._old_distill(teacher_pipe, examples, **kwargs) + assert all(eg.reference.tensor.any() for eg in examples) + return out + + student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec) + student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={}) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 8d3653ab1..eb4254d31 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,7 +1,10 @@ -from typing import Callable +from pathlib import Path +from typing import Callable, Iterable, Any, Dict -from spacy import util -from spacy.util import ensure_path, registry, load_model_from_config +import srsly + +from spacy import util, Errors +from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList from spacy.kb.kb_in_memory import InMemoryLookupKB from spacy.vocab import Vocab from thinc.api import Config @@ -63,19 +66,21 @@ def _check_kb(kb): assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_) + candidates = sorted( + kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_ + ) assert len(candidates) == 2 - assert candidates[0].entity_ == "Q007" + assert candidates[0].entity_id_ == "Q007" assert 6.999 < candidates[0].entity_freq < 7.01 assert candidates[0].entity_vector == [0, 0, 7] - assert candidates[0].alias_ == "double07" + assert candidates[0].alias == "double07" assert 0.899 < candidates[0].prior_prob < 0.901 - assert candidates[1].entity_ == "Q17" + assert candidates[1].entity_id_ == "Q17" assert 1.99 < candidates[1].entity_freq < 2.01 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].alias_ == "double07" + assert candidates[1].alias == "double07" assert 0.099 < candidates[1].prior_prob < 0.101 @@ -91,7 +96,10 @@ def test_serialize_subclassed_kb(): [components.entity_linker] factory = "entity_linker" - + + [components.entity_linker.generate_empty_kb] + @misc = "kb_test.CustomEmptyKB.v1" + [initialize] [initialize.components] @@ -99,7 +107,7 @@ def test_serialize_subclassed_kb(): [initialize.components.entity_linker] [initialize.components.entity_linker.kb_loader] - @misc = "spacy.CustomKB.v1" + @misc = "kb_test.CustomKB.v1" entity_vector_length = 342 custom_field = 666 """ @@ -109,10 +117,57 @@ def test_serialize_subclassed_kb(): super().__init__(vocab, entity_vector_length) self.custom_field = custom_field - @registry.misc("spacy.CustomKB.v1") + def to_disk(self, path, exclude: Iterable[str] = 
SimpleFrozenList()): + """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well.""" + path = ensure_path(path) + if not path.exists(): + path.mkdir(parents=True) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def serialize_custom_fields(file_path: Path) -> None: + srsly.write_json(file_path, {"custom_field": self.custom_field}) + + serialize = { + "contents": lambda p: self.write_contents(p), + "strings.json": lambda p: self.vocab.strings.to_disk(p), + "custom_fields": lambda p: serialize_custom_fields(p), + } + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well.""" + path = ensure_path(path) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def deserialize_custom_fields(file_path: Path) -> None: + self.custom_field = srsly.read_json(file_path)["custom_field"] + + deserialize: Dict[str, Callable[[Any], Any]] = { + "contents": lambda p: self.read_contents(p), + "strings.json": lambda p: self.vocab.strings.from_disk(p), + "custom_fields": lambda p: deserialize_custom_fields(p), + } + util.from_disk(path, deserialize, exclude) + + @registry.misc("kb_test.CustomEmptyKB.v1") + def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return SubInMemoryLookupKB( + vocab=vocab, + entity_vector_length=entity_vector_length, + custom_field=0, + ) + + return empty_kb_factory + + @registry.misc("kb_test.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int - ) -> Callable[[Vocab], InMemoryLookupKB]: + ) -> Callable[[Vocab], SubInMemoryLookupKB]: def custom_kb_factory(vocab): kb = SubInMemoryLookupKB( vocab=vocab, @@ -139,6 +194,6 @@ def test_serialize_subclassed_kb(): nlp2 = util.load_model_from_path(tmp_dir) entity_linker2 = nlp2.get_pipe("entity_linker") # After IO, the KB is the standard one - assert type(entity_linker2.kb) == InMemoryLookupKB + assert type(entity_linker2.kb) == SubInMemoryLookupKB assert entity_linker2.kb.entity_vector_length == 342 - assert not hasattr(entity_linker2.kb, "custom_field") + assert entity_linker2.kb.custom_field == 666 diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index fd80c3d8e..f6356ac9e 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -13,8 +13,11 @@ from spacy.vocab import Vocab from ..util import make_tempdir -test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] -test_strings_attrs = [(["rats", "are", "cute"], "Hello")] +test_strings = [ + (StringStore(), StringStore()), + (StringStore(["rats", "are", "cute"]), StringStore(["i", "like", "rats"])), +] +test_strings_attrs = [(StringStore(["rats", "are", "cute"]), "Hello")] @pytest.mark.issue(599) @@ -81,7 +84,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): vocab2 = Vocab(strings=strings2) vocab1_b = vocab1.to_bytes() vocab2_b = vocab2.to_bytes() - if strings1 == strings2: + if strings1.to_bytes() == strings2.to_bytes(): assert vocab1_b == vocab2_b else: assert vocab1_b != vocab2_b @@ -117,11 +120,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): def 
test_serialize_vocab_lex_attrs_bytes(strings, lex_attr): vocab1 = Vocab(strings=strings) vocab2 = Vocab() - vocab1[strings[0]].norm_ = lex_attr - assert vocab1[strings[0]].norm_ == lex_attr - assert vocab2[strings[0]].norm_ != lex_attr + s = next(iter(vocab1.strings)) + vocab1[s].norm_ = lex_attr + assert vocab1[s].norm_ == lex_attr + assert vocab2[s].norm_ != lex_attr vocab2 = vocab2.from_bytes(vocab1.to_bytes()) - assert vocab2[strings[0]].norm_ == lex_attr + assert vocab2[s].norm_ == lex_attr @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -136,14 +140,15 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr): def test_serialize_vocab_lex_attrs_disk(strings, lex_attr): vocab1 = Vocab(strings=strings) vocab2 = Vocab() - vocab1[strings[0]].norm_ = lex_attr - assert vocab1[strings[0]].norm_ == lex_attr - assert vocab2[strings[0]].norm_ != lex_attr + s = next(iter(vocab1.strings)) + vocab1[s].norm_ = lex_attr + assert vocab1[s].norm_ == lex_attr + assert vocab2[s].norm_ != lex_attr with make_tempdir() as d: file_path = d / "vocab" vocab1.to_disk(file_path) vocab2 = vocab2.from_disk(file_path) - assert vocab2[strings[0]].norm_ == lex_attr + assert vocab2[s].norm_ == lex_attr @pytest.mark.parametrize("strings1,strings2", test_strings) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index dc7ce46fe..752750d33 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -2,7 +2,6 @@ import os import math from collections import Counter from typing import Tuple, List, Dict, Any -import pkg_resources import time from pathlib import Path @@ -1126,6 +1125,7 @@ def test_cli_find_threshold(capsys): ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.parametrize( "reqs,output", [ @@ -1158,6 +1158,8 @@ def test_cli_find_threshold(capsys): ], ) def test_project_check_requirements(reqs, output): + import pkg_resources + # excessive guard against unlikely package name try: pkg_resources.require("spacyunknowndoesnotexist12345") diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 40100412a..8aaadf686 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,5 +1,7 @@ import os from pathlib import Path +import pytest +import srsly from typer.testing import CliRunner from spacy.tokens import DocBin, Doc @@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab): # Instead of checking specific wording of the output, which may change, # we'll check that this section of the debug output is present. 
assert "= Trainable Lemmatizer =" in result_debug_data.stdout + + +# project tests + +SAMPLE_PROJECT = { + "title": "Sample project", + "description": "This is a project for testing", + "assets": [ + { + "dest": "assets/spacy-readme.md", + "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md", + "checksum": "411b2c89ccf34288fae8ed126bf652f7", + }, + { + "dest": "assets/citation.cff", + "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff", + "checksum": "c996bfd80202d480eb2e592369714e5e", + "extra": True, + }, + ], + "commands": [ + { + "name": "ok", + "help": "print ok", + "script": ["python -c \"print('okokok')\""], + }, + { + "name": "create", + "help": "make a file", + "script": ["touch abc.txt"], + "outputs": ["abc.txt"], + }, + { + "name": "clean", + "help": "remove test file", + "script": ["rm abc.txt"], + }, + ], +} + +SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT) + + +@pytest.fixture +def project_dir(): + with make_tempdir() as pdir: + (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT) + yield pdir + + +def test_project_document(project_dir): + readme_path = project_dir / "README.md" + assert not readme_path.exists(), "README already exists" + result = CliRunner().invoke( + app, ["project", "document", str(project_dir), "-o", str(readme_path)] + ) + assert result.exit_code == 0 + assert readme_path.is_file() + text = readme_path.read_text("utf-8") + assert SAMPLE_PROJECT["description"] in text + + +def test_project_assets(project_dir): + asset_dir = project_dir / "assets" + assert not asset_dir.exists(), "Assets dir is already present" + result = CliRunner().invoke(app, ["project", "assets", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded" + # check that extras work + result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded" + + +def test_project_run(project_dir): + # make sure dry run works + test_file = project_dir / "abc.txt" + result = CliRunner().invoke( + app, ["project", "run", "--dry", "create", str(project_dir)] + ) + assert result.exit_code == 0 + assert not test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)]) + assert result.exit_code == 0 + assert "okokok" in result.stdout + + +@pytest.mark.parametrize( + "options", + [ + "", + # "--sparse", + "--branch v3", + "--repo https://github.com/explosion/projects --branch v3", + ], +) +def test_project_clone(options): + with make_tempdir() as workspace: + out = workspace / "project" + target = "benchmarks/ner_conll03" + if not options: + options = [] + else: + options = options.split() + result = CliRunner().invoke( + app, ["project", "clone", target, *options, str(out)] + ) + assert result.exit_code == 0 + assert (out / "README.md").is_file() + + +def test_project_push_pull(project_dir): + proj = dict(SAMPLE_PROJECT) + remote = "xyz" + + with make_tempdir() as remote_dir: + proj["remotes"] = {remote: str(remote_dir)} + proj_text = srsly.yaml_dumps(proj) + (project_dir / "project.yml").write_text(proj_text) + + test_file = project_dir / "abc.txt" + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert 
test_file.is_file() + result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)]) + assert result.exit_code == 0 + result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)]) + assert result.exit_code == 0 + assert not test_file.exists() + result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 3d0905dd3..9b8c7b9c7 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -98,7 +98,7 @@ def assert_sents_error(doc): def warn_error(proc_name, proc, docs, e): logger = logging.getLogger("spacy") - logger.warning(f"Trouble with component {proc_name}.") + logger.warning("Trouble with component %s.", proc_name) @pytest.fixture diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index d91f41db3..cd7f954ae 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -17,7 +17,7 @@ def test_issue361(en_vocab, text1, text2): @pytest.mark.issue(600) def test_issue600(): - vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) + vocab = Vocab() doc = Doc(vocab, words=["hello"]) doc[0].tag_ = "NN" diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 93cd8de05..48bc21c27 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -105,6 +105,7 @@ class Doc: start_idx: int, end_idx: int, label: Union[int, str] = ..., + *, kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., alignment_mode: str = ..., @@ -127,12 +128,12 @@ class Doc: blocked: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ..., - default: str = ... + default: str = ..., ) -> None: ... @property - def noun_chunks(self) -> Iterator[Span]: ... + def noun_chunks(self) -> Tuple[Span]: ... @property - def sents(self) -> Iterator[Span]: ... + def sents(self) -> Tuple[Span]: ... @property def lang(self) -> int: ... @property diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2eca1aafd..0ea2c39ab 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -520,7 +520,7 @@ cdef class Doc: def doc(self): return self - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): + def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be created. @@ -657,9 +657,6 @@ cdef class Doc: elif self.vocab.vectors.size > 0: self._vector = sum(t.vector for t in self) / len(self) return self._vector - elif self.tensor.size > 0: - self._vector = self.tensor.mean(axis=0) - return self._vector else: return xp.zeros((self.vocab.vectors_length,), dtype="float32") @@ -706,10 +703,10 @@ cdef class Doc: return self.text property ents: - """The named entities in the document. Returns a tuple of named entity + """The named entities in the document. Returns a list of named entity `Span` objects, if the entity recognizer has been applied. - RETURNS (tuple): Entities in the document, one `Span` per entity. + RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity. DOCS: https://spacy.io/api/doc#ents """ @@ -867,7 +864,7 @@ cdef class Doc: NP-level coordination, no prepositional phrases, and no relative clauses. 
- YIELDS (Span): Noun chunks in the document. + RETURNS (Tuple[Span]): Noun chunks in the document. DOCS: https://spacy.io/api/doc#noun_chunks """ @@ -876,36 +873,35 @@ cdef class Doc: # Accumulate the result before beginning to iterate over it. This # prevents the tokenization from being changed out from under us - # during the iteration. The tricky thing here is that Span accepts - # its tokenization changing, so it's okay once we have the Span - # objects. See Issue #375. + # during the iteration. spans = [] for start, end, label in self.noun_chunks_iterator(self): spans.append(Span(self, start, end, label=label)) - for span in spans: - yield span + return tuple(spans) @property def sents(self): """Iterate over the sentences in the document. Yields sentence `Span` objects. Sentence spans have no label. - YIELDS (Span): Sentences in the document. + RETURNS (Tuple[Span]): Sentences in the document. DOCS: https://spacy.io/api/doc#sents """ if not self.has_annotation("SENT_START"): raise ValueError(Errors.E030) if "sents" in self.user_hooks: - yield from self.user_hooks["sents"](self) + return tuple(self.user_hooks["sents"](self)) else: start = 0 + spans = [] for i in range(1, self.length): if self.c[i].sent_start == 1: - yield Span(self, start, i) + spans.append(Span(self, start, i)) start = i if start != self.length: - yield Span(self, start, self.length) + spans.append(Span(self, start, self.length)) + return tuple(spans) @property def lang(self): @@ -1605,7 +1601,7 @@ cdef class Doc: for span_group in doc_json.get("spans", {}): spans = [] for span in doc_json["spans"][span_group]: - char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"]) + char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"]) if char_span is None: raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"])) spans.append(char_span) diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 549990c5e..e5031fea9 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -74,6 +74,8 @@ class Span: @property def ents(self) -> Tuple[Span]: ... @property + def sents(self) -> Tuple[Span]: ... + @property def has_vector(self) -> bool: ... @property def vector(self) -> Floats1d: ... @@ -86,7 +88,7 @@ class Span: @property def text_with_ws(self) -> str: ... @property - def noun_chunks(self) -> Iterator[Span]: ... + def noun_chunks(self) -> Tuple[Span]: ... @property def root(self) -> Token: ... def char_span( @@ -94,6 +96,7 @@ class Span: start_idx: int, end_idx: int, label: Union[int, str] = ..., + *, kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., alignment_mode: str = ..., diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 4990cb5f7..75f7db7ca 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -461,20 +461,21 @@ cdef class Span: """Obtain the sentences that contain this span. If the given span crosses sentence boundaries, return all sentences it is a part of. - RETURNS (Iterable[Span]): All sentences that the span is a part of. + RETURNS (Tuple[Span]): All sentences that the span is a part of. 
- DOCS: https://spacy.io/api/span#sents + DOCS: https://spacy.io/api/span#sents """ cdef int start cdef int i if "sents" in self.doc.user_span_hooks: - yield from self.doc.user_span_hooks["sents"](self) - elif "sents" in self.doc.user_hooks: + return tuple(self.doc.user_span_hooks["sents"](self)) + spans = [] + if "sents" in self.doc.user_hooks: for sentence in self.doc.user_hooks["sents"](self.doc): if sentence.end > self.start: if sentence.start < self.end or sentence.start == self.start == self.end: - yield sentence + spans.append(sentence) else: break else: @@ -489,12 +490,13 @@ cdef class Span: # Now, find all the sentences in the span for i in range(start + 1, self.doc.length): if self.doc.c[i].sent_start == 1: - yield Span(self.doc, start, i) + spans.append(Span(self.doc, start, i)) start = i if start >= self.end: break if start < self.end: - yield Span(self.doc, start, self.end) + spans.append(Span(self.doc, start, self.end)) + return tuple(spans) @property @@ -502,7 +504,7 @@ cdef class Span: """The named entities that fall completely within the span. Returns a tuple of `Span` objects. - RETURNS (tuple): Entities in the span, one `Span` per entity. + RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity. DOCS: https://spacy.io/api/span#ents """ @@ -517,7 +519,7 @@ cdef class Span: ents.append(ent) else: break - return ents + return tuple(ents) @property def has_vector(self): @@ -532,8 +534,6 @@ cdef class Span: return self.doc.user_span_hooks["has_vector"](self) elif self.vocab.vectors.size > 0: return any(token.has_vector for token in self) - elif self.doc.tensor.size > 0: - return True else: return False @@ -615,13 +615,15 @@ cdef class Span: NP-level coordination, no prepositional phrases, and no relative clauses. - YIELDS (Span): Noun chunks in the span. + RETURNS (Tuple[Span]): Noun chunks in the span. DOCS: https://spacy.io/api/span#noun_chunks """ + spans = [] for span in self.doc.noun_chunks: if span.start >= self.start and span.end <= self.end: - yield span + spans.append(span) + return tuple(spans) @property def root(self): @@ -666,11 +668,11 @@ cdef class Span: else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): + def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `span.text[start : end]`. - start (int): The index of the first character of the span. - end (int): The index of the first character after the span. + start_idx (int): The index of the first character of the span. + end_idx (int): The index of the first character after the span. label (Union[int, str]): A label to attach to the Span, e.g. for named entities. kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. 
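Taken together, the `doc.pyx` and `span.pyx` hunks above make `char_span` accept `kb_id`, `vector`, `alignment_mode` and `span_id` as keyword-only arguments, and turn `ents`, `sents` and `noun_chunks` into materialized tuples rather than lazily yielded spans. A rough sketch of how calling code adapts, assuming spaCy is built from this branch and a trained pipeline such as `en_core_web_sm` is installed (the pipeline name and example sentence are assumptions, not part of the patch):

```python
# Illustrative only: shows the keyword-only char_span arguments and the
# tuple-returning sents/noun_chunks properties from this branch.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

# kb_id (and vector/alignment_mode/span_id) must now be passed by keyword.
span = doc.char_span(0, 15, "TECH", kb_id="Q42")
assert span is not None and span.text == "Autonomous cars"

# On this branch these are tuples, not generators as in released spaCy 3.x.
assert isinstance(doc.sents, tuple)
assert isinstance(doc.noun_chunks, tuple)
```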
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 64c707acd..74f812af7 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -389,8 +389,6 @@ cdef class Token: """ if "has_vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["has_vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return True return self.vocab.has_vector(self.c.lex.orth) @property @@ -404,8 +402,6 @@ cdef class Token: """ if "vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return self.doc.tensor[self.i] else: return self.vocab.get_vector(self.c.lex.orth) diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py index 426fddf90..7e2494f5b 100644 --- a/spacy/training/callbacks.py +++ b/spacy/training/callbacks.py @@ -11,7 +11,7 @@ def create_copy_from_base_model( ) -> Callable[[Language], Language]: def copy_from_base_model(nlp): if tokenizer: - logger.info(f"Copying tokenizer from: {tokenizer}") + logger.info("Copying tokenizer from: %s", tokenizer) base_nlp = load_model(tokenizer) if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]: nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"])) @@ -23,7 +23,7 @@ def create_copy_from_base_model( ) ) if vocab: - logger.info(f"Copying vocab from: {vocab}") + logger.info("Copying vocab from: %s", vocab) # only reload if the vocab is from a different model if tokenizer != vocab: base_nlp = load_model(vocab) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index d626ad0e0..086ad831c 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -29,7 +29,7 @@ def create_docbin_reader( ) -> Callable[["Language"], Iterable[Example]]: if path is None: raise ValueError(Errors.E913) - util.logger.debug(f"Loading corpus from path: {path}") + util.logger.debug("Loading corpus from path: %s", path) return Corpus( path, gold_preproc=gold_preproc, diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 408acdbee..c626cb813 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": frozen_components = T["frozen_components"] # Sourced components that require resume_training resume_components = [p for p in sourced if p not in frozen_components] - logger.info(f"Pipeline: {nlp.pipe_names}") + logger.info("Pipeline: %s", nlp.pipe_names) if resume_components: with nlp.select_pipes(enable=resume_components): - logger.info(f"Resuming training for: {resume_components}") + logger.info("Resuming training for: %s", resume_components) nlp.resume_training(sgd=optimizer) # Make sure that listeners are defined before initializing further nlp._link_components() @@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": if T["max_epochs"] == -1: sample_size = 100 logger.debug( - f"Due to streamed train corpus, using only first {sample_size} " - f"examples for initialization. If necessary, provide all labels " - f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" + "Due to streamed train corpus, using only first %s examples for initialization. " + "If necessary, provide all labels in [initialize]. 
" + "More info: https://spacy.io/api/cli#init_labels", + sample_size, ) nlp.initialize( lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer ) else: nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) - logger.info(f"Initialized pipeline components: {nlp.pipe_names}") + logger.info("Initialized pipeline components: %s", nlp.pipe_names) # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: for listener in getattr( @@ -109,7 +110,7 @@ def init_vocab( ) -> None: if lookups: nlp.vocab.lookups = lookups - logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") + logger.info("Added vocab lookups: %s", ", ".join(lookups.tables)) data_path = ensure_path(data) if data_path is not None: lex_attrs = srsly.read_jsonl(data_path) @@ -125,11 +126,11 @@ def init_vocab( else: oov_prob = DEFAULT_OOV_PROB nlp.vocab.cfg.update({"oov_prob": oov_prob}) - logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") + logger.info("Added %d lexical entries to the vocab", len(nlp.vocab)) logger.info("Created vocabulary") if vectors is not None: load_vectors_into_model(nlp, vectors) - logger.info(f"Added vectors: {vectors}") + logger.info("Added vectors: %s", vectors) # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) @@ -191,7 +192,7 @@ def init_tok2vec( if weights_data is not None: layer = get_tok2vec_ref(nlp, P) layer.from_bytes(weights_data) - logger.info(f"Loaded pretrained weights from {init_tok2vec}") + logger.info("Loaded pretrained weights from %s", init_tok2vec) return True return False @@ -215,13 +216,13 @@ def convert_vectors( nlp.vocab.deduplicate_vectors() else: if vectors_loc: - logger.info(f"Reading vectors from {vectors_loc}") + logger.info("Reading vectors from %s", vectors_loc) vectors_data, vector_keys, floret_settings = read_vectors( vectors_loc, truncate, mode=mode, ) - logger.info(f"Loaded vectors from {vectors_loc}") + logger.info("Loaded vectors from %s", vectors_loc) else: vectors_data, vector_keys = (None, None) if vector_keys is not None and mode != VectorsMode.floret: diff --git a/spacy/training/loop.py b/spacy/training/loop.py index fcc023a0d..c737d7c01 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None: if subdir.exists(): try: shutil.rmtree(str(subdir)) - logger.debug(f"Removed existing output directory: {subdir}") + logger.debug("Removed existing output directory: %s", subdir) except Exception as e: raise IOError(Errors.E901.format(path=path)) from e diff --git a/spacy/util.py b/spacy/util.py index d653e0305..1ce869152 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -33,6 +33,7 @@ import inspect import pkgutil import logging import socket +import stat try: import cupy.random @@ -139,8 +140,17 @@ class registry(thinc.registry): return func @classmethod - def find(cls, registry_name: str, func_name: str) -> Callable: - """Get info about a registered function from the registry.""" + def find( + cls, registry_name: str, func_name: str + ) -> Dict[str, Optional[Union[str, int]]]: + """Find information about a registered function, including the + module and path to the file it's defined in, the line number and the + docstring, if available. + + registry_name (str): Name of the catalogue registry. + func_name (str): Name of the registered function. 
+ RETURNS (Dict[str, Optional[Union[str, int]]]): The function info. + """ # We're overwriting this classmethod so we're able to provide more # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): @@ -1030,8 +1040,15 @@ def make_tempdir() -> Generator[Path, None, None]: """ d = Path(tempfile.mkdtemp()) yield d + + # On Windows, git clones use read-only files, which cause permission errors + # when being deleted. This forcibly fixes permissions. + def force_remove(rmfunc, path, ex): + os.chmod(path, stat.S_IWRITE) + rmfunc(path) + try: - shutil.rmtree(str(d)) + shutil.rmtree(str(d), onerror=force_remove) except PermissionError as e: warnings.warn(Warnings.W091.format(dir=d, msg=e)) diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index 871044fff..e4a88bfd8 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -26,7 +26,7 @@ class Vocab: def __init__( self, lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ..., - strings: Optional[Union[List[str], StringStore]] = ..., + strings: Optional[StringStore] = ..., lookups: Optional[Lookups] = ..., oov_prob: float = ..., writing_system: Dict[str, Any] = ..., diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f3c3595ef..0d3c9c883 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -49,9 +49,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ - def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, - oov_prob=-20., writing_system={}, get_noun_chunks=None, - **deprecated_kwargs): + def __init__(self, lex_attr_getters=None, strings=None, lookups=None, + oov_prob=-20., writing_system=None, get_noun_chunks=None): """Create the vocabulary. lex_attr_getters (dict): A dictionary mapping attribute IDs to @@ -69,16 +68,19 @@ cdef class Vocab: self.cfg = {'oov_prob': oov_prob} self.mem = Pool() self._by_orth = PreshMap() - self.strings = StringStore() self.length = 0 - if strings: - for string in strings: - _ = self[string] + if strings is None: + self.strings = StringStore() + else: + self.strings = strings self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings) self.vectors = Vectors(strings=self.strings) self.lookups = lookups - self.writing_system = writing_system + if writing_system is None: + self.writing_system = {} + else: + self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks property vectors: diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index 54b5065e8..ee41144f6 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a | `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.EmptyKB.v1 {id="EmptyKB"} +### spacy.EmptyKB.v1 {id="EmptyKB.v1"} A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) -instance. This is the default when a new entity linker component is created. +instance. | Name | Description | | ---------------------- | ----------------------------------------------------------------------------------- | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. 
~~int~~ | +### spacy.EmptyKB.v2 {id="EmptyKB"} + +A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) +instance. This is the default when a new entity linker component is created. It +returns a `Callable[[Vocab, int], InMemoryLookupKB]`. + ### spacy.KBFromFile.v1 {id="KBFromFile"} A function that reads an existing `KnowledgeBase` from file. @@ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default `CandidateGenerator` uses the text of a mention to find its potential aliases in the `KnowledgeBase`. Note that this function is case-dependent. +### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"} + +A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of +[`Span`](/api/span) objects denoting named entities, and returns a list of +plausible [`Candidate`](/api/kb/#candidate) objects per specified +[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a +mention to find its potential aliases in the `KnowledgeBase`. Note that this +function is case-dependent. + ## Coreference {id="coref-architectures",tag="experimental"} A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 868079e8c..1a3f15e48 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir] ### project pull {id="project-pull",tag="command"} Download all files or directories listed as `outputs` for commands, unless they -are not already present locally. When searching for files in the remote, `pull` +are already present locally. When searching for files in the remote, `pull` won't just look at the output path, but will also consider the **command string** and the **hashes of the dependencies**. For instance, let's say you've previously pushed a checkpoint to the remote, but now you've changed some diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index 390034a6c..14e0916d1 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. 
`A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index 1a3f6179f..fca056ed0 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -214,6 +214,7 @@ alignment mode `"strict". | `start` | The index of the first character of the span. ~~int~~ | | `end` | The index of the last character after the span. ~~int~~ | | `label` | A label to attach to the span, e.g. for named entities. 
~~Union[int, str]~~ | +| _keyword-only_ | | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | @@ -653,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer). ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"} -Iterate over the base noun phrases in the document. Yields base noun-phrase -`Span` objects, if the document has been syntactically parsed. A base noun -phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be -nested within it – so no NP-level coordination, no prepositional phrases, and no -relative clauses. +Returns a tuple of the base noun phrases in the doc, if the document has been +syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that +does not permit other NPs to be nested within it – so no NP-level coordination, +no prepositional phrases, and no relative clauses. To customize the noun chunk iterator in a loaded pipeline, modify [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` @@ -674,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised. > assert chunks[1].text == "another phrase" > ``` -| Name | Description | -| ---------- | ------------------------------------- | -| **YIELDS** | Noun chunks in the document. ~~Span~~ | +| Name | Description | +| ----------- | -------------------------------------------- | +| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ | ## Doc.sents {id="sents",tag="property",model="sentences"} -Iterate over the sentences in the document. Sentence spans have no label. +Returns a tuple of the sentences in the document. Sentence spans have no label. This property is only available when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the @@ -696,9 +696,9 @@ will raise an error otherwise. > assert [s.root.text for s in sents] == ["is", "'s"] > ``` -| Name | Description | -| ---------- | ----------------------------------- | -| **YIELDS** | Sentences in the document. ~~Span~~ | +| Name | Description | +| ----------- | ------------------------------------------ | +| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ | ## Doc.has_vector {id="has_vector",tag="property",model="vectors"} diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 12b2f6bef..3af7ac4dd 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -53,20 +53,22 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. 
~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| Setting | Description | +| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | +| `generate_empty_kb` 3.6 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. 
Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
-| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx
index c24fe78d6..3b33f7fb7 100644
--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@@ -10,9 +10,9 @@ version: 3.5
The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
implements all of its methods. It stores all KB data in-memory and generates
-[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
-entity names. It's highly optimized for both a low memory footprint and speed of
-retrieval.
+[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions
+with entity names. It's highly optimized for both a low memory footprint and
+speed of retrieval.
## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}
@@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base.
## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb#candidate). Wraps
+of type [`InMemoryCandidate`](/api/kb#candidate). Wraps
[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
> #### Example
@@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps
> candidates = kb.get_candidates(doc[0:2])
> ```
-| Name | Description |
-| ----------- | -------------------------------------------------------------------- |
-| `mention` | The textual mention or alias. ~~Span~~ |
-| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
+| Name | Description |
+| ----------- | ------------------------------------------------------------------------------------ |
+| `mention` | The textual mention or alias. ~~Span~~ |
+| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ |
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
@@ -189,31 +189,16 @@ to you.
>
> ```python
> from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
> nlp = English()
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
> ```
-| Name | Description |
-| ----------- | -------------------------------------------------------------------------------------------- |
-| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
-| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
-
-## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
-
-Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb#candidate).
-
-> #### Example
->
-> ```python
-> candidates = kb.get_alias_candidates("Douglas")
-> ```
-
-| Name | Description |
-| ----------- | ------------------------------------------------------------- |
-| `alias` | The textual mention or alias. ~~str~~ |
-| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
+| Name | Description |
+| ----------- | ------------------------------------------------------------------------------------------------------------ |
+| `mentions` | The textual mentions. ~~SpanGroup~~ |
+| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx
index 2b0d4d9d6..94506162f 100644
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@@ -93,33 +93,17 @@ to you.
>
> ```python
> from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
> nlp = English()
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------- |
-| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
+| `mentions` | The textual mentions. ~~SpanGroup~~ |
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
-## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
-
-
- This method is _not_ available from spaCy 3.5 onwards.
-
-
-From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
-allow more flexibility in customizing knowledge bases. Some of its methods were
-moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
-one of those being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-Note:
-[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
-defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
-
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
Given a certain entity ID, retrieve its pretrained entity vector.
@@ -190,25 +174,25 @@ Restore the state of the knowledge base from a given directory. Note that the
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
-## Candidate {id="candidate",tag="class"}
+## InMemoryCandidate {id="candidate",tag="class"}
-A `Candidate` object refers to a textual mention (alias) that may or may not be
-resolved to a specific entity from a `KnowledgeBase`. This will be used as input
-for the entity linking algorithm which will disambiguate the various candidates
-to the correct one. Each candidate `(alias, entity)` pair is assigned to a
-certain prior probability.
+An `InMemoryCandidate` object refers to a textual mention (alias) that may or
+may not be resolved to a specific entity from a `KnowledgeBase`. This will be
+used as input for the entity linking algorithm which will disambiguate the
+various candidates to the correct one. Each candidate `(alias, entity)` pair is
+assigned a certain prior probability.
-### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"}
+### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
-Construct a `Candidate` object. Usually this constructor is not called directly,
-but instead these objects are returned by the `get_candidates` method of the
-[`entity_linker`](/api/entitylinker) pipe.
+Construct an `InMemoryCandidate` object. Usually this constructor is not called
+directly, but instead these objects are returned by the `get_candidates` method
+of the [`entity_linker`](/api/entitylinker) pipe.
> #### Example
>
> ```python
-> from spacy.kb import Candidate
-> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
+> from spacy.kb import InMemoryCandidate
+> candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
> ```
| Name | Description |
@@ -216,10 +200,10 @@ but instead these objects are returned by the `get_candidates` method of the
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
-| `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
+| `alias_hash` | The hash of the entity alias. ~~int~~ |
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
-## Candidate attributes {id="candidate-attributes"}
+## InMemoryCandidate attributes {id="candidate-attributes"}
| Name | Description |
| --------------- | ------------------------------------------------------------------------ |
diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx
index 7e7042866..e1ada3b45 100644
--- a/website/docs/api/span.mdx
+++ b/website/docs/api/span.mdx
@@ -188,9 +188,10 @@ the character indices don't map to a valid span.
| Name | Description |
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `start` | The index of the first character of the span. ~~int~~ |
-| `end` | The index of the last character after the span. ~~int~~ |
+| `start_idx` | The index of the first character of the span. ~~int~~ |
+| `end_idx` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
+| _keyword-only_ | |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `alignment_mode` 3.5.1 | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
@@ -274,17 +275,16 @@ The named entities that fall completely within the span.
Returns a tuple of > assert ents[0].text == "Mr. Best" > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------------- | -| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------ | +| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ | ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"} -Iterate over the base noun phrases in the span. Yields base noun-phrase `Span` -objects, if the document has been syntactically parsed. A base noun phrase, or -"NP chunk", is a noun phrase that does not permit other NPs to be nested within -it – so no NP-level coordination, no prepositional phrases, and no relative -clauses. +Returns a tuple of the base noun phrases in the span if the document has been +syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that +does not permit other NPs to be nested within it – so no NP-level coordination, +no prepositional phrases, and no relative clauses. If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) has not been implemeted for the given language, a `NotImplementedError` is @@ -300,9 +300,9 @@ raised. > assert chunks[0].text == "another phrase" > ``` -| Name | Description | -| ---------- | --------------------------------- | -| **YIELDS** | Noun chunks in the span. ~~Span~~ | +| Name | Description | +| ----------- | ---------------------------------------- | +| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ | ## Span.as_doc {id="as_doc",tag="method"} @@ -524,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)] ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"} -Returns a generator over the sentences the span belongs to. This property is -only available when [sentence boundaries](/usage/linguistic-features#sbd) have -been set on the document by the `parser`, `senter`, `sentencizer` or some custom +Returns a tuple of the sentences the span belongs to. This property is only +available when [sentence boundaries](/usage/linguistic-features#sbd) have been +set on the document by the `parser`, `senter`, `sentencizer` or some custom function. It will raise an error otherwise. If the span happens to cross sentence boundaries, all sentences the span @@ -540,9 +540,9 @@ overlaps with will be returned. > assert len(span.sents) == 2 > ``` -| Name | Description | -| ----------- | -------------------------------------------------------------------------- | -| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------- | +| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ | ## Attributes {id="attributes"} diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx index 7e380f5f8..2425c8adc 100644 --- a/website/docs/api/stringstore.mdx +++ b/website/docs/api/stringstore.mdx @@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of integer IDs. This ensures that strings always map to the same ID, even from different `StringStores`. + + +Note that a `StringStore` instance is not static. It increases in size as texts +with new tokens are processed. + + + ## StringStore.\_\_init\_\_ {id="init",tag="method"} Create the `StringStore`. 
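To illustrate the note added for `StringStore` above: the store attached to a pipeline's vocab gains entries whenever it encounters strings it has not seen before. The following is a minimal, illustrative sketch (not taken from the patch; it assumes a blank English pipeline):

```python
# Illustrative sketch: a StringStore is not static – it grows as new strings
# are added, e.g. while texts are processed.
from spacy.lang.en import English
from spacy.strings import StringStore

store = StringStore(["apple", "orange"])  # seeded with two strings
assert "apple" in store

nlp = English()
n_before = len(nlp.vocab.strings)
nlp("Bananas and kiwis introduce brand-new strings.")
n_after = len(nlp.vocab.strings)
assert n_after >= n_before  # processing text can only add entries
```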
diff --git a/website/docs/api/tok2vec.mdx b/website/docs/api/tok2vec.mdx
index a1bb1265e..8b6d2380b 100644
--- a/website/docs/api/tok2vec.mdx
+++ b/website/docs/api/tok2vec.mdx
@@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both
| `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~ |
+## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"}
+
+Performs an update of the student pipe's model using the student's distillation
+examples and sets the annotations of the teacher's distillation examples using
+the teacher pipe.
+
+Unlike other trainable pipes, the student pipe doesn't directly learn its
+representations from the teacher. However, since downstream pipes that do
+perform distillation expect the tok2vec annotations to be present on the
+correct distillation examples, we need to ensure that they are set beforehand.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need to have gold
+annotations, the teacher can add its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("tok2vec")
+> student_pipe = student.add_pipe("tok2vec")
+> optimizer = student.resume_training()
+> losses = student_pipe.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~ |
+| `examples` | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop` | Dropout rate. ~~float~~ |
+| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
## Tok2Vec.pipe {id="pipe",tag="method"}
Apply the pipe to a stream of documents. This usually happens under the hood
diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
index 01690f161..5600ab485 100644
--- a/website/docs/api/top-level.mdx
+++ b/website/docs/api/top-level.mdx
@@ -355,22 +355,22 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="dep", options=options)
> ```
-| Name | Description |
-| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
-| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
-| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
-| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
-| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`.
~~bool~~ | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | -| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | -| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | -| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | -| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | -| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | -| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | +| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | +| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | +| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | +| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | +| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | +| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | +| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | +| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | #### Named Entity Visualizer options {id="displacy_options-ent"} diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 3faf1f1a0..1e32eb118 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared between `Doc` objects. 
+ + +Note that a `Vocab` instance is not static. It increases in size as texts with +new tokens are processed. + + + ## Vocab.\_\_init\_\_ {id="init",tag="method"} Create the vocabulary. @@ -17,14 +24,15 @@ Create the vocabulary. > #### Example > > ```python +> from spacy.strings import StringStore > from spacy.vocab import Vocab -> vocab = Vocab(strings=["hello", "world"]) +> vocab = Vocab(strings=StringStore(["hello", "world"])) > ``` | Name | Description | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ | -| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | +| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values. ~~Optional[StringStore]~~ | | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | diff --git a/website/docs/usage/101/_vectors-similarity.mdx b/website/docs/usage/101/_vectors-similarity.mdx index 6deab926d..39ee8e48a 100644 --- a/website/docs/usage/101/_vectors-similarity.mdx +++ b/website/docs/usage/101/_vectors-similarity.mdx @@ -22,17 +22,20 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, To make them compact and fast, spaCy's small [pipeline packages](/models) (all -packages that end in `sm`) **don't ship with word vectors**, and only include -context-sensitive **tensors**. This means you can still use the `similarity()` -methods to compare documents, spans and tokens – but the result won't be as -good, and individual tokens won't have any vectors assigned. So in order to use -_real_ word vectors, you need to download a larger pipeline package: +packages that end in `sm`) **don't ship with word vectors**. In order to use +`similarity()`, you need to download a larger pipeline package that includes +vectors: ```diff - python -m spacy download en_core_web_sm -+ python -m spacy download en_core_web_lg ++ python -m spacy download en_core_web_md ``` +In spaCy v3 and earlier, small pipeline packages supported `similarity()` by +backing off to context-sensitive tensors from the `tok2vec` component. These +tensors do not work well for this purpose and this backoff has been removed in +spaCy v4. 
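As a quick illustration of the updated guidance above (illustrative only – it assumes the `en_core_web_md` package has already been downloaded), `similarity()` works as expected once a pipeline with word vectors is installed:

```python
# Illustrative sketch: similarity scores come from the package's word vectors,
# so a pipeline with vectors (md/lg) is needed in spaCy v4.
import spacy

nlp = spacy.load("en_core_web_md")
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

print(doc1.similarity(doc2))          # Doc vs. Doc
print(doc1[2:4].similarity(doc1[5]))  # Span ("salty fries") vs. Token ("hamburgers")
```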
+ Pipeline packages that come with built-in word vectors make them available as diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 0c2bd7a66..792ec119a 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. 
| +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} @@ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline") The saved pipeline now includes the `"entity_ruler"` in its [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a -file `entityruler.jsonl` with the patterns. When you load the pipeline back in, -all pipeline components will be restored and deserialized – including the entity +file `patterns.jsonl` with the patterns. When you load the pipeline back in, all +pipeline components will be restored and deserialized – including the entity ruler. This lets you ship powerful pipeline packages with binary weights _and_ rules included! diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx index 1d3682af4..c372744de 100644 --- a/website/docs/usage/visualizers.mdx +++ b/website/docs/usage/visualizers.mdx @@ -58,12 +58,12 @@ arcs. -| Argument | Description | -| --------- | ----------------------------------------------------------------------------------------- | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| Argument | Description | +| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | For a list of all available options, see the [`displacy` API documentation](/api/top-level#displacy_options).
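For reference, the `color` and `bg` options documented in the table above are passed to displaCy like any other option, and any legal CSS color string should work. A minimal, illustrative sketch (assumes `en_core_web_sm` is installed):

```python
# Illustrative sketch: dependency visualization with custom colors.
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
options = {
    "compact": True,
    "color": "hsl(120, 100%, 50%)",  # text color, here given as an HSL string
    "bg": "#0d1117",                 # background color as a HEX string
    "font": "Arial",
}
html = displacy.render(doc, style="dep", options=options)  # returns SVG markup
```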