mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-21 17:41:59 +03:00
Merge remote-tracking branch 'upstream/v4' into feature/redo-lex-attr-getters
This commit is contained in:
commit
d3f7dcb3e3
5
.github/azure-steps.yml
vendored
5
.github/azure-steps.yml
vendored
|
@ -69,6 +69,11 @@ steps:
|
|||
# displayName: 'Test skip re-download (#12188)'
|
||||
# condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
# - script: |
|
||||
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||
# displayName: 'Test download_url in info CLI'
|
||||
# condition: eq(variables['python_version'] '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
displayName: 'Test convert CLI'
|
||||
|
|
2
.github/workflows/autoblack.yml
vendored
2
.github/workflows/autoblack.yml
vendored
|
@ -16,7 +16,7 @@ jobs:
|
|||
with:
|
||||
ref: ${{ github.head_ref }}
|
||||
- uses: actions/setup-python@v4
|
||||
- run: pip install black
|
||||
- run: pip install black -c requirements.txt
|
||||
- name: Auto-format code if needed
|
||||
run: black spacy
|
||||
# We can't run black --check here because that returns a non-zero excit
|
||||
|
|
|
@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
|
|||
Python modules. If you've built spaCy from source, you'll already have both
|
||||
tools installed.
|
||||
|
||||
As a general rule of thumb, we use f-strings for any formatting of strings.
|
||||
One exception are calls to Python's `logging` functionality.
|
||||
To avoid unnecessary string conversions in these cases, we use string formatting
|
||||
templates with `%s` and `%d` etc.
|
||||
|
||||
**⚠️ Note that formatting and linting is currently only possible for Python
|
||||
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
|
||||
|
||||
|
|
|
@ -41,7 +41,7 @@ jobs:
|
|||
inputs:
|
||||
versionSpec: "3.8"
|
||||
- script: |
|
||||
pip install black==22.3.0
|
||||
pip install black -c requirements.txt
|
||||
python -m black spacy --check
|
||||
displayName: "black"
|
||||
- script: |
|
||||
|
|
|
@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
|
|||
mock>=2.0.0,<3.0.0
|
||||
flake8>=3.8.0,<6.0.0
|
||||
hypothesis>=3.27.0,<7.0.0
|
||||
mypy>=0.990,<0.1000; platform_machine != "aarch64"
|
||||
mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
|
||||
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||
types-mock>=0.1.1
|
||||
types-setuptools>=57.0.0
|
||||
types-requests
|
||||
types-setuptools>=57.0.0
|
||||
black>=22.0,<23.0
|
||||
black==22.3.0
|
||||
|
|
|
@ -90,9 +90,9 @@ def parse_config_overrides(
|
|||
cli_overrides = _parse_overrides(args, is_cli=True)
|
||||
if cli_overrides:
|
||||
keys = [k for k in cli_overrides if k not in env_overrides]
|
||||
logger.debug(f"Config overrides from CLI: {keys}")
|
||||
logger.debug("Config overrides from CLI: %s", keys)
|
||||
if env_overrides:
|
||||
logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
|
||||
logger.debug("Config overrides from env variables: %s", list(env_overrides))
|
||||
return {**cli_overrides, **env_overrides}
|
||||
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
from typing import Optional, Dict, Any, Union, List
|
||||
import platform
|
||||
import pkg_resources
|
||||
import json
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, MarkdownRenderer
|
||||
import srsly
|
||||
import importlib.metadata
|
||||
|
||||
from ._util import app, Arg, Opt, string_to_list
|
||||
from .download import get_model_filename, get_latest_version
|
||||
|
@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
|
|||
dist-info available.
|
||||
"""
|
||||
try:
|
||||
dist = pkg_resources.get_distribution(model)
|
||||
data = json.loads(dist.get_metadata("direct_url.json"))
|
||||
return data["url"]
|
||||
except pkg_resources.DistributionNotFound:
|
||||
# no such package
|
||||
return None
|
||||
dist = importlib.metadata.distribution(model)
|
||||
text = dist.read_text("direct_url.json")
|
||||
if isinstance(text, str):
|
||||
data = json.loads(text)
|
||||
return data["url"]
|
||||
except Exception:
|
||||
# something else, like no file or invalid JSON
|
||||
return None
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def info_model_url(model: str) -> Dict[str, Any]:
|
||||
|
|
|
@ -21,7 +21,6 @@ def init_vectors_cli(
|
|||
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
||||
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
|
||||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
||||
# fmt: on
|
||||
|
@ -44,7 +43,6 @@ def init_vectors_cli(
|
|||
vectors_loc,
|
||||
truncate=truncate,
|
||||
prune=prune,
|
||||
name=name,
|
||||
mode=mode,
|
||||
)
|
||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||
|
|
|
@ -252,7 +252,7 @@ def get_third_party_dependencies(
|
|||
raise regerr from None
|
||||
module_name = func_info.get("module") # type: ignore[attr-defined]
|
||||
if module_name: # the code is part of a module, not a --code file
|
||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
|
||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
|
||||
dependencies = []
|
||||
for module_name in modules:
|
||||
if module_name in distributions:
|
||||
|
|
|
@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
|||
# in the list.
|
||||
while commands:
|
||||
for i, cmd in enumerate(list(commands)):
|
||||
logger.debug(f"CMD: {cmd['name']}.")
|
||||
logger.debug("CMD: %s.", cmd["name"])
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if all(dep.exists() for dep in deps):
|
||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
||||
for output_path in cmd.get("outputs", []):
|
||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||
logger.debug(
|
||||
f"URL: {url} for {output_path} with command hash {cmd_hash}"
|
||||
"URL: %s for %s with command hash %s",
|
||||
url,
|
||||
output_path,
|
||||
cmd_hash,
|
||||
)
|
||||
yield url, output_path
|
||||
|
||||
|
@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
|||
commands.pop(i)
|
||||
break
|
||||
else:
|
||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
|
||||
logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
|
||||
else:
|
||||
# If we didn't break the for loop, break the while loop.
|
||||
break
|
||||
|
|
|
@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
|
|||
remote = config["remotes"][remote]
|
||||
storage = RemoteStorage(project_dir, remote)
|
||||
for cmd in config.get("commands", []):
|
||||
logger.debug(f"CMD: cmd['name']")
|
||||
logger.debug("CMD: %s", cmd["name"])
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if any(not dep.exists() for dep in deps):
|
||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
|
||||
logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
|
||||
continue
|
||||
cmd_hash = get_command_hash(
|
||||
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
||||
)
|
||||
logger.debug(f"CMD_HASH: {cmd_hash}")
|
||||
logger.debug("CMD_HASH: %s", cmd_hash)
|
||||
for output_path in cmd.get("outputs", []):
|
||||
output_loc = project_dir / output_path
|
||||
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
||||
|
@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
|
|||
content_hash=get_content_hash(output_loc),
|
||||
)
|
||||
logger.debug(
|
||||
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
|
||||
"URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
|
||||
)
|
||||
yield output_path, url
|
||||
|
||||
|
|
|
@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
|
|||
import os.path
|
||||
from pathlib import Path
|
||||
|
||||
import pkg_resources
|
||||
from wasabi import msg
|
||||
from wasabi.util import locale_escape
|
||||
import sys
|
||||
|
@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
|
|||
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
|
||||
exist.
|
||||
"""
|
||||
import pkg_resources
|
||||
|
||||
failed_pkgs_msgs: List[str] = []
|
||||
conflicting_pkgs_msgs: List[str] = []
|
||||
|
|
|
@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"ignoring the duplicate entry.")
|
||||
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
|
||||
"incorrect. Modify PhraseMatcher._terminal_hash to fix.")
|
||||
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
||||
W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in "
|
||||
"the Knowledge Base.")
|
||||
W026 = ("Unable to set all sentence boundaries from dependency parses. If "
|
||||
"you are constructing a parse tree incrementally by setting "
|
||||
|
@ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
||||
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
||||
|
||||
# v4 warning strings
|
||||
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
|
||||
W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability "
|
||||
"lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure "
|
||||
"to return `True` in `.supports_prior_probs`.")
|
||||
|
||||
|
||||
class Errors(metaclass=ErrorsWithCodes):
|
||||
|
@ -437,8 +441,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
||||
"exceed 1, but found {sum}.")
|
||||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
|
||||
"`kb.add_entity` and `kb.add_alias` to add entries.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty.")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
"should be of equal length.")
|
||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||
|
@ -951,7 +954,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
||||
"with `displacy.serve(doc, port=port)`")
|
||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||
"or use `auto_switch_port=True` to pick an available port automatically.")
|
||||
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||
|
||||
# v4 error strings
|
||||
E4000 = ("Expected a Doc as input, but got: '{type}'")
|
||||
|
@ -961,6 +964,9 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E4003 = ("Training examples for distillation must have the exact same tokens in the "
|
||||
"reference and predicted docs.")
|
||||
E4004 = ("Backprop is not supported when is_train is not set.")
|
||||
E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
|
||||
E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
|
||||
|
||||
|
||||
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from .kb import KnowledgeBase
|
||||
from .kb_in_memory import InMemoryLookupKB
|
||||
from .candidate import Candidate, get_candidates, get_candidates_batch
|
||||
from .candidate import Candidate, InMemoryCandidate
|
||||
|
||||
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
from .kb cimport KnowledgeBase
|
||||
from libcpp.vector cimport vector
|
||||
from .kb_in_memory cimport InMemoryLookupKB
|
||||
from ..typedefs cimport hash_t
|
||||
|
||||
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
||||
cdef class Candidate:
|
||||
cdef readonly KnowledgeBase kb
|
||||
cdef hash_t entity_hash
|
||||
cdef float entity_freq
|
||||
cdef vector[float] entity_vector
|
||||
cdef hash_t alias_hash
|
||||
cdef float prior_prob
|
||||
pass
|
||||
|
||||
|
||||
cdef class InMemoryCandidate(Candidate):
|
||||
cdef readonly hash_t _entity_hash
|
||||
cdef readonly hash_t _alias_hash
|
||||
cpdef vector[float] _entity_vector
|
||||
cdef float _prior_prob
|
||||
cdef readonly InMemoryLookupKB _kb
|
||||
cdef float _entity_freq
|
||||
|
|
|
@ -1,74 +1,96 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
|
||||
from typing import Iterable
|
||||
from .kb cimport KnowledgeBase
|
||||
from ..tokens import Span
|
||||
from .kb_in_memory cimport InMemoryLookupKB
|
||||
from ..errors import Errors
|
||||
|
||||
cdef class Candidate:
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
||||
"""A `Candidate` object refers to a textual mention that may or may not be resolved
|
||||
to a specific entity from a Knowledge Base. This will be used as input for the entity linking
|
||||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity) pair is assigned a certain prior probability.
|
||||
Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base,
|
||||
is assigned a certain prior probability.
|
||||
|
||||
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||
"""
|
||||
|
||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
||||
self.kb = kb
|
||||
self.entity_hash = entity_hash
|
||||
self.entity_freq = entity_freq
|
||||
self.entity_vector = entity_vector
|
||||
self.alias_hash = alias_hash
|
||||
self.prior_prob = prior_prob
|
||||
def __init__(self):
|
||||
# Make sure abstract Candidate is not instantiated.
|
||||
if self.__class__ == Candidate:
|
||||
raise TypeError(
|
||||
Errors.E1046.format(cls_name=self.__class__.__name__)
|
||||
)
|
||||
|
||||
@property
|
||||
def entity(self) -> int:
|
||||
"""RETURNS (uint64): hash of the entity's KB ID/name"""
|
||||
return self.entity_hash
|
||||
def entity_id(self) -> int:
|
||||
"""RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
|
||||
otherwise the hash of the entity ID string)."""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def entity_(self) -> str:
|
||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||
return self.kb.vocab.strings[self.entity_hash]
|
||||
def entity_id_(self) -> str:
|
||||
"""RETURNS (str): String representation of entity ID."""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def alias(self) -> int:
|
||||
"""RETURNS (uint64): hash of the alias"""
|
||||
return self.alias_hash
|
||||
def entity_vector(self) -> vector[float]:
|
||||
"""RETURNS (vector[float]): Entity vector."""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
cdef class InMemoryCandidate(Candidate):
|
||||
"""Candidate for InMemoryLookupKB."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
kb: InMemoryLookupKB,
|
||||
entity_hash: int,
|
||||
alias_hash: int,
|
||||
entity_vector: vector[float],
|
||||
prior_prob: float,
|
||||
entity_freq: float
|
||||
):
|
||||
"""
|
||||
kb (InMemoryLookupKB]): InMemoryLookupKB instance.
|
||||
entity_id (int): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
|
||||
entity_freq (int): Entity frequency in KB corpus.
|
||||
entity_vector (List[float]): Entity embedding.
|
||||
alias_hash (int): Alias hash.
|
||||
prior_prob (float): Prior probability of entity for this alias. I. e. the probability that, independent of
|
||||
the context, this alias - which matches one of this entity's aliases - resolves to one this entity.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self._entity_hash = entity_hash
|
||||
self._entity_vector = entity_vector
|
||||
self._prior_prob = prior_prob
|
||||
self._kb = kb
|
||||
self._alias_hash = alias_hash
|
||||
self._entity_freq = entity_freq
|
||||
|
||||
@property
|
||||
def alias_(self) -> str:
|
||||
"""RETURNS (str): ID of the original alias"""
|
||||
return self.kb.vocab.strings[self.alias_hash]
|
||||
def entity_id(self) -> int:
|
||||
return self._entity_hash
|
||||
|
||||
@property
|
||||
def entity_freq(self) -> float:
|
||||
return self.entity_freq
|
||||
|
||||
@property
|
||||
def entity_vector(self) -> Iterable[float]:
|
||||
return self.entity_vector
|
||||
def entity_vector(self) -> vector[float]:
|
||||
return self._entity_vector
|
||||
|
||||
@property
|
||||
def prior_prob(self) -> float:
|
||||
return self.prior_prob
|
||||
"""RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
|
||||
this entity."""
|
||||
return self._prior_prob
|
||||
|
||||
@property
|
||||
def alias(self) -> str:
|
||||
"""RETURNS (str): Alias."""
|
||||
return self._kb.vocab.strings[self._alias_hash]
|
||||
|
||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for a given mention and fetching appropriate entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Span): Entity mention for which to identify candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates(mention)
|
||||
@property
|
||||
def entity_id_(self) -> str:
|
||||
return self._kb.vocab.strings[self._entity_hash]
|
||||
|
||||
|
||||
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Iterable[Span]): Entity mentions for which to identify candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates_batch(mentions)
|
||||
@property
|
||||
def entity_freq(self) -> float:
|
||||
"""RETURNS (float): Entity frequency in KB corpus."""
|
||||
return self._entity_freq
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from .candidate import Candidate
|
||||
from ..tokens import Span
|
||||
from ..tokens import Span, SpanGroup
|
||||
from ..util import SimpleFrozenList
|
||||
from ..errors import Errors
|
||||
|
||||
|
@ -30,21 +30,23 @@ cdef class KnowledgeBase:
|
|||
self.entity_vector_length = entity_vector_length
|
||||
self.mem = Pool()
|
||||
|
||||
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||
def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
If no candidate is found for a given text, an empty list is returned.
|
||||
mentions (Iterable[Span]): Mentions for which to get candidates.
|
||||
Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
|
||||
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
|
||||
probability of the specified mention text resolving to that entity - might be included.
|
||||
If no candidates are found for a given mention, an empty list is returned.
|
||||
mentions (SpanGroup): Mentions for which to get candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
return [self.get_candidates(span) for span in mentions]
|
||||
|
||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for specified text. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
If the no candidate is found for a given text, an empty list is returned.
|
||||
Return candidate entities for a specific mention. Each candidate defines at least the entity and the
|
||||
entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
|
||||
probability of the specified mention text resolving to that entity - might be included.
|
||||
If no candidate is found for the given mention, an empty list is returned.
|
||||
mention (Span): Mention for which to get candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
"""
|
||||
|
@ -106,3 +108,10 @@ cdef class KnowledgeBase:
|
|||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
|
||||
)
|
||||
|
||||
@property
|
||||
def supports_prior_probs(self) -> bool:
|
||||
"""RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions."""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__)
|
||||
)
|
||||
|
|
|
@ -18,7 +18,7 @@ from .. import util
|
|||
from ..util import SimpleFrozenList, ensure_path
|
||||
from ..vocab cimport Vocab
|
||||
from .kb cimport KnowledgeBase
|
||||
from .candidate import Candidate as Candidate
|
||||
from .candidate import InMemoryCandidate
|
||||
|
||||
|
||||
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||
|
@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
self._alias_index = PreshMap(nr_aliases + 1)
|
||||
self._aliases_table = alias_vec(nr_aliases + 1)
|
||||
|
||||
def is_empty(self):
|
||||
return len(self) == 0
|
||||
|
||||
def __len__(self):
|
||||
return self.get_size_entities()
|
||||
|
||||
|
@ -223,10 +226,10 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
alias_entry.probs = probs
|
||||
self._aliases_table[alias_index] = alias_entry
|
||||
|
||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||
return self.get_alias_candidates(mention.text) # type: ignore
|
||||
def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
|
||||
return self._get_alias_candidates(mention.text) # type: ignore
|
||||
|
||||
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
|
||||
def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
|
||||
"""
|
||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
|
@ -238,14 +241,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
|
||||
return [Candidate(kb=self,
|
||||
entity_hash=self._entries[entry_index].entity_hash,
|
||||
entity_freq=self._entries[entry_index].freq,
|
||||
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
||||
alias_hash=alias_hash,
|
||||
prior_prob=prior_prob)
|
||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||
if entry_index != 0]
|
||||
return [
|
||||
InMemoryCandidate(
|
||||
kb=self,
|
||||
entity_hash=self._entries[entry_index].entity_hash,
|
||||
alias_hash=alias_hash,
|
||||
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
||||
prior_prob=prior_prob,
|
||||
entity_freq=self._entries[entry_index].freq
|
||||
)
|
||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||
if entry_index != 0
|
||||
]
|
||||
|
||||
def get_vector(self, str entity):
|
||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||
|
@ -276,6 +283,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
return 0.0
|
||||
|
||||
def supports_prior_probs(self) -> bool:
|
||||
return True
|
||||
|
||||
def to_bytes(self, **kwargs):
|
||||
"""Serialize the current state to a binary string.
|
||||
"""
|
||||
|
|
|
@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
|
|||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
# Punctuation stolen from Danish
|
||||
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
class SwedishDefaults(BaseDefaults):
|
||||
|
|
33
spacy/lang/sv/punctuation.py
Normal file
33
spacy/lang/sv/punctuation.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
from ..punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
|
||||
]
|
||||
)
|
||||
|
||||
_suffixes = [
|
||||
suffix
|
||||
for suffix in TOKENIZER_SUFFIXES
|
||||
if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
|
||||
]
|
||||
_suffixes += [r"(?<=[^sSxXzZ])\'"]
|
||||
|
||||
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
|
@ -107,7 +107,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
|||
|
||||
@registry.misc("spacy.LookupsDataLoader.v1")
|
||||
def load_lookups_data(lang, tables):
|
||||
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
|
||||
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
|
||||
lookups = load_lookups(lang=lang, tables=tables)
|
||||
return lookups
|
||||
|
||||
|
@ -175,8 +175,7 @@ class Language:
|
|||
if not isinstance(vocab, Vocab) and vocab is not True:
|
||||
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
|
||||
if vocab is True:
|
||||
vectors_name = meta.get("vectors", {}).get("name")
|
||||
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
|
||||
vocab = create_vocab(self.lang, self.Defaults)
|
||||
else:
|
||||
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
|
||||
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
|
||||
|
@ -230,7 +229,6 @@ class Language:
|
|||
"width": self.vocab.vectors_length,
|
||||
"vectors": len(self.vocab.vectors),
|
||||
"keys": self.vocab.vectors.n_keys,
|
||||
"name": self.vocab.vectors.name,
|
||||
"mode": self.vocab.vectors.mode,
|
||||
}
|
||||
self._meta["labels"] = dict(self.pipe_labels)
|
||||
|
@ -1205,7 +1203,7 @@ class Language:
|
|||
_: Optional[Any] = None,
|
||||
*,
|
||||
drop: float = 0.0,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
sgd: Union[Optimizer, None, Literal[False]] = None,
|
||||
losses: Optional[Dict[str, float]] = None,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
|
@ -1216,7 +1214,9 @@ class Language:
|
|||
examples (Iterable[Example]): A batch of examples
|
||||
_: Should not be set - serves to catch backwards-incompatible scripts.
|
||||
drop (float): The dropout rate.
|
||||
sgd (Optimizer): An optimizer.
|
||||
sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
|
||||
be created via create_optimizer if 'None'. No optimizer will
|
||||
be used when set to 'False'.
|
||||
losses (Dict[str, float]): Dictionary to update with the loss, keyed by
|
||||
component.
|
||||
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
|
||||
|
@ -1249,17 +1249,12 @@ class Language:
|
|||
component_cfg[name].setdefault("drop", drop)
|
||||
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
|
||||
for name, proc in self.pipeline:
|
||||
# ignore statements are used here because mypy ignores hasattr
|
||||
if name not in exclude and hasattr(proc, "update"):
|
||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore
|
||||
if sgd not in (None, False):
|
||||
if (
|
||||
name not in exclude
|
||||
and isinstance(proc, ty.TrainableComponent)
|
||||
and proc.is_trainable
|
||||
and proc.model not in (True, False, None)
|
||||
):
|
||||
proc.finish_update(sgd)
|
||||
if (
|
||||
name not in exclude
|
||||
and isinstance(proc, ty.TrainableComponent)
|
||||
and proc.is_trainable
|
||||
):
|
||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
||||
if name in annotates:
|
||||
for doc, eg in zip(
|
||||
_pipe(
|
||||
|
@ -1272,6 +1267,18 @@ class Language:
|
|||
examples,
|
||||
):
|
||||
eg.predicted = doc
|
||||
# Only finish the update after all component updates are done. Some
|
||||
# components may share weights (such as tok2vec) and we only want
|
||||
# to apply weight updates after all gradients are accumulated.
|
||||
for name, proc in self.pipeline:
|
||||
if (
|
||||
name not in exclude
|
||||
and isinstance(proc, ty.TrainableComponent)
|
||||
and proc.is_trainable
|
||||
and sgd not in (None, False)
|
||||
):
|
||||
proc.finish_update(sgd)
|
||||
|
||||
return losses
|
||||
|
||||
def rehearse(
|
||||
|
@ -2069,7 +2076,7 @@ class Language:
|
|||
pipe = self.get_pipe(pipe_name)
|
||||
pipe_cfg = self._pipe_configs[pipe_name]
|
||||
if listeners:
|
||||
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
|
||||
util.logger.debug("Replacing listeners of component '%s'", pipe_name)
|
||||
if len(list(listeners)) != len(pipe_listeners):
|
||||
# The number of listeners defined in the component model doesn't
|
||||
# match the listeners to replace, so we won't be able to update
|
||||
|
@ -2192,9 +2199,6 @@ class Language:
|
|||
if path.exists():
|
||||
data = srsly.read_json(path)
|
||||
self.meta.update(data)
|
||||
# self.meta always overrides meta["vectors"] with the metadata
|
||||
# from self.vocab.vectors, so set the name directly
|
||||
self.vocab.vectors.name = data.get("vectors", {}).get("name")
|
||||
|
||||
def deserialize_vocab(path: Path) -> None:
|
||||
if path.exists():
|
||||
|
@ -2263,9 +2267,6 @@ class Language:
|
|||
def deserialize_meta(b):
|
||||
data = srsly.json_loads(b)
|
||||
self.meta.update(data)
|
||||
# self.meta always overrides meta["vectors"] with the metadata
|
||||
# from self.vocab.vectors, so set the name directly
|
||||
self.vocab.vectors.name = data.get("vectors", {}).get("name")
|
||||
|
||||
deserializers: Dict[str, Callable[[bytes], Any]] = {}
|
||||
deserializers["config.cfg"] = lambda b: self.config.from_bytes(
|
||||
|
|
|
@ -82,8 +82,12 @@ cdef class DependencyMatcher:
|
|||
"$-": self._imm_left_sib,
|
||||
"$++": self._right_sib,
|
||||
"$--": self._left_sib,
|
||||
">+": self._imm_right_child,
|
||||
">-": self._imm_left_child,
|
||||
">++": self._right_child,
|
||||
">--": self._left_child,
|
||||
"<+": self._imm_right_parent,
|
||||
"<-": self._imm_left_parent,
|
||||
"<++": self._right_parent,
|
||||
"<--": self._left_parent,
|
||||
}
|
||||
|
@ -427,12 +431,34 @@ cdef class DependencyMatcher:
|
|||
def _left_sib(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
||||
|
||||
def _imm_right_child(self, doc, node):
|
||||
for child in doc[node].children:
|
||||
if child.i == node + 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _imm_left_child(self, doc, node):
|
||||
for child in doc[node].children:
|
||||
if child.i == node - 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _right_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i > node]
|
||||
|
||||
def _left_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i < node]
|
||||
|
||||
def _imm_right_parent(self, doc, node):
|
||||
if doc[node].head.i == node + 1:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
def _imm_left_parent(self, doc, node):
|
||||
if doc[node].head.i == node - 1:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
def _right_parent(self, doc, node):
|
||||
if doc[node].head.i > node:
|
||||
return [doc[node].head]
|
||||
|
|
|
@ -829,6 +829,11 @@ def _get_attr_values(spec, string_store):
|
|||
return attr_values
|
||||
|
||||
|
||||
def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
|
||||
# tuple order affects performance
|
||||
return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
|
||||
|
||||
# These predicate helper classes are used to match the REGEX, IN, >= etc
|
||||
# extensions to the matcher introduced in #3173.
|
||||
|
||||
|
@ -848,7 +853,7 @@ class _FuzzyPredicate:
|
|||
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
||||
self.fuzzy = int(fuzz) if fuzz else -1
|
||||
self.fuzzy_compare = fuzzy_compare
|
||||
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
|
||||
|
||||
def __call__(self, Token token):
|
||||
if self.is_extension:
|
||||
|
@ -870,7 +875,7 @@ class _RegexPredicate:
|
|||
self.value = re.compile(value)
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -906,7 +911,7 @@ class _SetPredicate:
|
|||
self.value = set(get_string_id(v) for v in value)
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -978,7 +983,7 @@ class _ComparisonPredicate:
|
|||
self.value = value
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
|
|||
if isinstance(value, dict):
|
||||
for type_, cls in predicate_types.items():
|
||||
if type_ in value:
|
||||
key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
|
||||
key = _predicate_cache_key(attr, type_, value[type_])
|
||||
if key in seen_predicates:
|
||||
output.append(seen_predicates[key])
|
||||
else:
|
||||
|
|
|
@ -6,9 +6,9 @@ from thinc.api import Model, Maxout, Linear, tuplify, Ragged
|
|||
|
||||
from ...util import registry
|
||||
from ...kb import KnowledgeBase, InMemoryLookupKB
|
||||
from ...kb import Candidate, get_candidates, get_candidates_batch
|
||||
from ...kb import Candidate
|
||||
from ...vocab import Vocab
|
||||
from ...tokens import Span, Doc
|
||||
from ...tokens import Doc, Span, SpanGroup
|
||||
from ..extract_spans import extract_spans
|
||||
from ...errors import Errors
|
||||
|
||||
|
@ -89,6 +89,14 @@ def load_kb(
|
|||
return kb_from_file
|
||||
|
||||
|
||||
@registry.misc("spacy.EmptyKB.v2")
|
||||
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
|
||||
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
|
||||
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
|
||||
|
||||
return empty_kb_factory
|
||||
|
||||
|
||||
@registry.misc("spacy.EmptyKB.v1")
|
||||
def empty_kb(
|
||||
entity_vector_length: int,
|
||||
|
@ -106,6 +114,28 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
|
|||
|
||||
@registry.misc("spacy.CandidateBatchGenerator.v1")
|
||||
def create_candidates_batch() -> Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
|
||||
]:
|
||||
return get_candidates_batch
|
||||
|
||||
|
||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for a given mention and fetching appropriate entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Span): Entity mention for which to identify candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates(mention)
|
||||
|
||||
|
||||
def get_candidates_batch(
|
||||
kb: KnowledgeBase, mentions: SpanGroup
|
||||
) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mentions (SpanGroup): Entity mentions for which to identify candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
"""
|
||||
return kb.get_candidates_batch(mentions)
|
||||
|
|
|
@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
|
|||
cdef np.ndarray step_actions
|
||||
|
||||
scores = []
|
||||
while sizes.states >= 1:
|
||||
while sizes.states >= 1 and (actions is None or len(actions) > 0):
|
||||
step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
|
||||
step_actions = actions[0] if actions is not None else None
|
||||
assert step_actions is None or step_actions.size == sizes.states, \
|
||||
f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
|
||||
with nogil:
|
||||
_predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
|
||||
if actions is None:
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any
|
||||
from typing import cast
|
||||
import warnings
|
||||
from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast
|
||||
from numpy import dtype
|
||||
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
||||
from pathlib import Path
|
||||
|
@ -10,14 +10,15 @@ from thinc.api import CosineDistance, Model, Optimizer, Config
|
|||
from thinc.api import set_dropout_rate
|
||||
|
||||
from ..kb import KnowledgeBase, Candidate
|
||||
from ..ml import empty_kb
|
||||
from ..tokens import Doc, Span
|
||||
from ..ml import empty_kb
|
||||
from ..tokens import Doc, Span, SpanGroup
|
||||
from .pipe import deserialize_config
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from ..language import Language
|
||||
from ..vocab import Vocab
|
||||
from ..training import Example, validate_examples, validate_get_examples
|
||||
from ..errors import Errors
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import SimpleFrozenList, registry
|
||||
from .. import util
|
||||
from ..scorer import Scorer
|
||||
|
@ -27,9 +28,6 @@ ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
|
|||
|
||||
KNOWLEDGE_BASE_IDS = "kb_ids"
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.EntityLinker.v2"
|
||||
|
@ -60,7 +58,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"entity_vector_length": 64,
|
||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||
"overwrite": True,
|
||||
"overwrite": False,
|
||||
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
|
||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||
"use_gold_ents": True,
|
||||
"candidates_batch_size": 1,
|
||||
|
@ -85,8 +84,9 @@ def make_entity_linker(
|
|||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
use_gold_ents: bool,
|
||||
|
@ -107,8 +107,9 @@ def make_entity_linker(
|
|||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
get_candidates_batch (
|
||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||
Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
|
@ -117,28 +118,9 @@ def make_entity_linker(
|
|||
prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
"""
|
||||
|
||||
if not model.attrs.get("include_span_maker", False):
|
||||
try:
|
||||
from spacy_legacy.components.entity_linker import EntityLinker_v1
|
||||
except:
|
||||
raise ImportError(
|
||||
"In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
|
||||
)
|
||||
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
|
||||
return EntityLinker_v1(
|
||||
nlp.vocab,
|
||||
model,
|
||||
name,
|
||||
labels_discard=labels_discard,
|
||||
n_sents=n_sents,
|
||||
incl_prior=incl_prior,
|
||||
incl_context=incl_context,
|
||||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
)
|
||||
raise ValueError(Errors.E4005)
|
||||
|
||||
return EntityLinker(
|
||||
nlp.vocab,
|
||||
model,
|
||||
|
@ -150,6 +132,7 @@ def make_entity_linker(
|
|||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
get_candidates_batch=get_candidates_batch,
|
||||
generate_empty_kb=generate_empty_kb,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
use_gold_ents=use_gold_ents,
|
||||
|
@ -189,9 +172,10 @@ class EntityLinker(TrainablePipe):
|
|||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
use_gold_ents: bool,
|
||||
candidates_batch_size: int,
|
||||
|
@ -212,15 +196,18 @@ class EntityLinker(TrainablePipe):
|
|||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
get_candidates_batch (
|
||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
||||
Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
|
||||
Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
overwrite (bool): Whether to overwrite existing non-empty annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
candidates_batch_size (int): Size of batches for entity candidate generation.
|
||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
|
||||
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
"""
|
||||
|
||||
|
@ -237,6 +224,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self.labels_discard = list(labels_discard)
|
||||
# how many neighbour sentences to take into account
|
||||
self.n_sents = n_sents
|
||||
self.incl_prior = incl_prior
|
||||
self.incl_context = incl_context
|
||||
|
@ -244,9 +232,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.get_candidates_batch = get_candidates_batch
|
||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
# how many neighbour sentences to take into account
|
||||
# create an empty KB by default
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
|
||||
self.scorer = scorer
|
||||
self.use_gold_ents = use_gold_ents
|
||||
self.candidates_batch_size = candidates_batch_size
|
||||
|
@ -255,6 +241,8 @@ class EntityLinker(TrainablePipe):
|
|||
|
||||
if candidates_batch_size < 1:
|
||||
raise ValueError(Errors.E1044)
|
||||
if self.incl_prior and not self.kb.supports_prior_probs:
|
||||
warnings.warn(Warnings.W401)
|
||||
|
||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||
"""Define the KB of this pipe by providing a function that will
|
||||
|
@ -268,7 +256,7 @@ class EntityLinker(TrainablePipe):
|
|||
# Raise an error if the knowledge base is not initialized.
|
||||
if self.kb is None:
|
||||
raise ValueError(Errors.E1018.format(name=self.name))
|
||||
if len(self.kb) == 0:
|
||||
if hasattr(self.kb, "is_empty") and self.kb.is_empty():
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
||||
def initialize(
|
||||
|
@ -487,7 +475,8 @@ class EntityLinker(TrainablePipe):
|
|||
|
||||
batch_candidates = list(
|
||||
self.get_candidates_batch(
|
||||
self.kb, [ent_batch[idx] for idx in valid_ent_idx]
|
||||
self.kb,
|
||||
SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
|
||||
)
|
||||
if self.candidates_batch_size > 1
|
||||
else [
|
||||
|
@ -537,18 +526,19 @@ class EntityLinker(TrainablePipe):
|
|||
)
|
||||
elif len(candidates) == 1 and self.threshold is None:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
final_kb_ids.append(candidates[0].entity_id_)
|
||||
self._add_activations(
|
||||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=[1.0],
|
||||
ents=[candidates[0].entity_],
|
||||
ents=[candidates[0].entity_id],
|
||||
)
|
||||
else:
|
||||
random.shuffle(candidates)
|
||||
# set all prior probabilities to 0 if incl_prior=False
|
||||
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
||||
if not self.incl_prior:
|
||||
if self.incl_prior and self.kb.supports_prior_probs:
|
||||
prior_probs = xp.asarray([c.prior_prob for c in candidates]) # type: ignore
|
||||
else:
|
||||
prior_probs = xp.asarray([0.0 for _ in candidates])
|
||||
scores = prior_probs
|
||||
# add in similarity from the context
|
||||
|
@ -572,7 +562,7 @@ class EntityLinker(TrainablePipe):
|
|||
raise ValueError(Errors.E161)
|
||||
scores = prior_probs + sims - (prior_probs * sims)
|
||||
final_kb_ids.append(
|
||||
candidates[scores.argmax().item()].entity_
|
||||
candidates[scores.argmax().item()].entity_id_
|
||||
if self.threshold is None
|
||||
or scores.max() >= self.threshold
|
||||
else EntityLinker.NIL
|
||||
|
@ -581,7 +571,7 @@ class EntityLinker(TrainablePipe):
|
|||
doc_scores=doc_scores,
|
||||
doc_ents=doc_ents,
|
||||
scores=scores,
|
||||
ents=[c.entity for c in candidates],
|
||||
ents=[c.entity_id for c in candidates],
|
||||
)
|
||||
self._add_doc_activations(
|
||||
docs_scores=docs_scores,
|
||||
|
|
|
@ -21,10 +21,6 @@ from ..scorer import Scorer
|
|||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
BACKWARD_EXTEND = False
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -102,8 +98,8 @@ class Morphologizer(Tagger):
|
|||
model: Model,
|
||||
name: str = "morphologizer",
|
||||
*,
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
extend: bool = BACKWARD_EXTEND,
|
||||
overwrite: bool = False,
|
||||
extend: bool = False,
|
||||
scorer: Optional[Callable] = morphologizer_score,
|
||||
save_activations: bool = False,
|
||||
):
|
||||
|
@ -113,6 +109,8 @@ class Morphologizer(Tagger):
|
|||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
extend (bool): Whether to extend existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
||||
Scorer.score_token_attr_per_feat for the attribute "morph".
|
||||
|
|
|
@ -10,9 +10,6 @@ from ..language import Language
|
|||
from ..scorer import Scorer
|
||||
from .. import util
|
||||
|
||||
# see #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
@Language.factory(
|
||||
"sentencizer",
|
||||
assigns=["token.is_sent_start", "doc.sents"],
|
||||
|
@ -52,13 +49,14 @@ class Sentencizer(Pipe):
|
|||
name="sentencizer",
|
||||
*,
|
||||
punct_chars=None,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
overwrite=False,
|
||||
scorer=senter_score,
|
||||
):
|
||||
"""Initialize the sentencizer.
|
||||
|
||||
punct_chars (list): Punctuation characters to split on. Will be
|
||||
serialized with the nlp object.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the attribute "sents".
|
||||
|
||||
|
|
|
@ -18,8 +18,6 @@ from ..training import validate_examples, validate_get_examples
|
|||
from ..util import registry
|
||||
from .. import util
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
|
@ -83,7 +81,7 @@ class SentenceRecognizer(Tagger):
|
|||
model,
|
||||
name="senter",
|
||||
*,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
overwrite=False,
|
||||
scorer=senter_score,
|
||||
save_activations: bool = False,
|
||||
):
|
||||
|
@ -93,6 +91,7 @@ class SentenceRecognizer(Tagger):
|
|||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the attribute "sents".
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
|
|
@ -27,9 +27,6 @@ from .. import util
|
|||
|
||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -99,7 +96,7 @@ class Tagger(TrainablePipe):
|
|||
model,
|
||||
name="tagger",
|
||||
*,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
overwrite=False,
|
||||
scorer=tagger_score,
|
||||
neg_prefix="!",
|
||||
save_activations: bool = False,
|
||||
|
@ -110,6 +107,7 @@ class Tagger(TrainablePipe):
|
|||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attribute "tag".
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any
|
||||
from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
|
||||
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
||||
from thinc.types import Floats2d
|
||||
from itertools import islice
|
||||
|
||||
from .trainable_pipe import TrainablePipe
|
||||
|
@ -157,39 +158,9 @@ class Tok2Vec(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/tok2vec#update
|
||||
"""
|
||||
if losses is None:
|
||||
losses = {}
|
||||
validate_examples(examples, "Tok2Vec.update")
|
||||
docs = [eg.predicted for eg in examples]
|
||||
set_dropout_rate(self.model, drop)
|
||||
tokvecs, bp_tokvecs = self.model.begin_update(docs)
|
||||
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
||||
def accumulate_gradient(one_d_tokvecs):
|
||||
"""Accumulate tok2vec loss and gradient. This is passed as a callback
|
||||
to all but the last listener. Only the last one does the backprop.
|
||||
"""
|
||||
nonlocal d_tokvecs
|
||||
for i in range(len(one_d_tokvecs)):
|
||||
d_tokvecs[i] += one_d_tokvecs[i]
|
||||
losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
|
||||
return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
|
||||
|
||||
def backprop(one_d_tokvecs):
|
||||
"""Callback to actually do the backprop. Passed to last listener."""
|
||||
accumulate_gradient(one_d_tokvecs)
|
||||
d_docs = bp_tokvecs(d_tokvecs)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
return d_docs
|
||||
|
||||
batch_id = Tok2VecListener.get_batch_id(docs)
|
||||
for listener in self.listeners[:-1]:
|
||||
listener.receive(batch_id, tokvecs, accumulate_gradient)
|
||||
if self.listeners:
|
||||
self.listeners[-1].receive(batch_id, tokvecs, backprop)
|
||||
return losses
|
||||
return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses)
|
||||
|
||||
def get_loss(self, examples, scores) -> None:
|
||||
pass
|
||||
|
@@ -219,6 +190,96 @@ class Tok2Vec(TrainablePipe):
    def add_label(self, label):
        raise NotImplementedError

    def distill(
        self,
        teacher_pipe: Optional["TrainablePipe"],
        examples: Iterable["Example"],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        """Performs an update of the student pipe's model using the
        student's distillation examples and sets the annotations
        of the teacher's distillation examples using the teacher pipe.

        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use
            for prediction.
        examples (Iterable[Example]): Distillation examples. The reference (teacher)
            and predicted (student) docs must have the same number of tokens and the
            same orthography.
        drop (float): dropout rate.
        sgd (Optional[Optimizer]): An optimizer. Will be created via
            create_optimizer if not set.
        losses (Optional[Dict[str, float]]): Optional record of loss during
            distillation.
        RETURNS: The updated losses dictionary.

        DOCS: https://spacy.io/api/tok2vec#distill
        """
        # By default we require a teacher pipe, but there are downstream
        # implementations that don't require a pipe.
        if teacher_pipe is None:
            raise ValueError(Errors.E4002.format(name=self.name))
        teacher_docs = [eg.reference for eg in examples]
        student_docs = [eg.predicted for eg in examples]
        teacher_preds = teacher_pipe.predict(teacher_docs)
        teacher_pipe.set_annotations(teacher_docs, teacher_preds)
        return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses)

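Note: the distill API only needs a teacher pipe plus paired examples; the teacher annotates the reference docs and the student then updates against its own predicted docs. A hedged sketch of how this could be driven from user code (pipeline objects and texts here are illustrative assumptions):

    # Sketch only: distill a student tok2vec from a trained teacher pipeline.
    # Assumes teacher_nlp / student_nlp are already initialized Language objects.
    from spacy.training import Example

    teacher_tok2vec = teacher_nlp.get_pipe("tok2vec")
    student_tok2vec = student_nlp.get_pipe("tok2vec")

    examples = [
        Example(student_nlp.make_doc(text), teacher_nlp.make_doc(text))
        for text in ["They trade mortgage-backed securities."]
    ]
    losses = {}
    student_tok2vec.distill(teacher_tok2vec, examples, losses=losses)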
    def _update_with_docs(
        self,
        docs: Iterable[Doc],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ):
        if losses is None:
            losses = {}
        losses.setdefault(self.name, 0.0)
        set_dropout_rate(self.model, drop)

        tokvecs, accumulate_gradient, backprop = self._create_backprops(
            docs, losses, sgd=sgd
        )
        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners[:-1]:
            listener.receive(batch_id, tokvecs, accumulate_gradient)
        if self.listeners:
            self.listeners[-1].receive(batch_id, tokvecs, backprop)
        return losses

    def _create_backprops(
        self,
        docs: Iterable[Doc],
        losses: Dict[str, float],
        *,
        sgd: Optional[Optimizer] = None,
    ) -> Tuple[Floats2d, Callable, Callable]:
        tokvecs, bp_tokvecs = self.model.begin_update(docs)
        d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]

        def accumulate_gradient(one_d_tokvecs):
            """Accumulate tok2vec loss and gradient. This is passed as a callback
            to all but the last listener. Only the last one does the backprop.
            """
            nonlocal d_tokvecs
            for i in range(len(one_d_tokvecs)):
                d_tokvecs[i] += one_d_tokvecs[i]
                losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
            return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]

        def backprop(one_d_tokvecs):
            """Callback to actually do the backprop. Passed to last listener."""
            accumulate_gradient(one_d_tokvecs)
            d_docs = bp_tokvecs(d_tokvecs)
            if sgd is not None:
                self.finish_update(sgd)
            return d_docs

        return tokvecs, accumulate_gradient, backprop


class Tok2VecListener(Model):
    """A layer that gets fed its answers from an upstream connection,
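Note: the callback pair above is the heart of the listener protocol: every listener except the last only accumulates its gradient, and the final listener triggers the single backprop through the shared tok2vec model. A toy, framework-free rendering of that control flow (names are illustrative, not spaCy API):

    # Toy illustration of the accumulate-then-backprop pattern used above.
    grads = [0.0, 0.0]            # stands in for d_tokvecs
    loss = {"tok2vec": 0.0}

    def accumulate(gs):
        for i, g in enumerate(gs):
            grads[i] += g
            loss["tok2vec"] += g ** 2
        return [0.0 for _ in gs]  # hand fresh zero gradients back to the caller

    def backprop(gs):
        accumulate(gs)
        # Only now is the shared model updated, once, with the summed gradients.
        return [f"d_doc from {g}" for g in grads]

    listeners = [accumulate, accumulate, backprop]  # last one does the real work
    for callback in listeners:
        callback([0.1, 0.2])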
@@ -36,6 +36,11 @@ from ..errors import Errors, Warnings
from .. import util


# TODO: Remove when we switch to Cython 3.
cdef extern from "<algorithm>" namespace "std" nogil:
    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +


NUMPY_OPS = NumpyOps()

@@ -253,8 +258,8 @@ class Parser(TrainablePipe):
            # batch uniform length. Since we do not have a gold standard
            # sequence, we use the teacher's predictions as the gold
            # standard.
            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
            states = self._init_batch(teacher_pipe, student_docs, max_moves)
            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
        else:
            states = self.moves.init_batch(student_docs)

@@ -265,12 +270,12 @@ class Parser(TrainablePipe):
        # gradients of the student's transition distributions relative to the
        # teacher's distributions.

        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
            max_moves=max_moves)
        student_inputs = TransitionModelInputs(docs=student_docs,
            states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
        actions = states2actions(student_states)
        actions = _states_diff_to_actions(states, student_states)
        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
            moves=self.moves, actions=actions)
            states=states, moves=teacher_pipe.moves, actions=actions)
        (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)

        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
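Note: in the updated distillation path the student parses from the prepared states and the teacher re-scores exactly the transitions the student took (the `actions` computed above), so the two probability distributions line up state by state. A hedged sketch of driving parser distillation from user code (pipeline objects and texts are illustrative assumptions):

    # Sketch only: distill a student parser from a trained teacher parser.
    from spacy.training import Example

    teacher_parser = teacher_nlp.get_pipe("parser")   # assumed trained pipeline
    student_parser = student_nlp.get_pipe("parser")   # assumed initialized student

    examples = [
        Example(student_nlp.make_doc(text), teacher_nlp.make_doc(text))
        for text in ["I like London and Berlin."]
    ]
    optimizer = student_nlp.resume_training()
    losses = {}
    student_parser.distill(teacher_parser, examples, sgd=optimizer, losses=losses)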
@ -522,7 +527,7 @@ class Parser(TrainablePipe):
|
|||
set_dropout_rate(self.model, 0.0)
|
||||
student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
|
||||
actions = states2actions(student_states)
|
||||
actions = _states_to_actions(student_states)
|
||||
teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
|
||||
_, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
|
||||
|
||||
|
@ -642,7 +647,7 @@ class Parser(TrainablePipe):
|
|||
raise ValueError(Errors.E149) from None
|
||||
return self
|
||||
|
||||
def _init_batch(self, teacher_step_model, docs, max_length):
|
||||
def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
|
||||
"""Make a square batch of length equal to the shortest transition
|
||||
sequence or a cap. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
|
@ -651,10 +656,12 @@ class Parser(TrainablePipe):
|
|||
_init_gold_batch, this version uses a teacher model to generate the
|
||||
cut sequences."""
|
||||
cdef:
|
||||
StateClass start_state
|
||||
StateClass state
|
||||
Transition action
|
||||
all_states = self.moves.init_batch(docs)
|
||||
TransitionSystem moves = teacher_pipe.moves
|
||||
|
||||
# Start with the same heuristic as in supervised training: exclude
|
||||
# docs that are within the maximum length.
|
||||
all_states = moves.init_batch(docs)
|
||||
states = []
|
||||
to_cut = []
|
||||
for state, doc in zip(all_states, docs):
|
||||
|
@ -663,18 +670,28 @@ class Parser(TrainablePipe):
|
|||
states.append(state)
|
||||
else:
|
||||
to_cut.append(state)
|
||||
|
||||
if not to_cut:
|
||||
return states
|
||||
|
||||
# Parse the states that are too long with the teacher's parsing model.
|
||||
teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
|
||||
states=[state.copy() for state in to_cut])
|
||||
(teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
|
||||
|
||||
# Step through the teacher's actions and store every state after
|
||||
# each multiple of max_length.
|
||||
teacher_actions = _states_to_actions(teacher_states)
|
||||
while to_cut:
|
||||
states.extend(state.copy() for state in to_cut)
|
||||
# Move states forward max_length actions.
|
||||
length = 0
|
||||
while to_cut and length < max_length:
|
||||
teacher_scores = teacher_step_model.predict(to_cut)
|
||||
self.transition_states(to_cut, teacher_scores)
|
||||
# States that are completed do not need further cutting.
|
||||
to_cut = [state for state in to_cut if not state.is_final()]
|
||||
length += 1
|
||||
return states
|
||||
for step_actions in teacher_actions[:max_length]:
|
||||
to_cut = moves.apply_actions(to_cut, step_actions)
|
||||
teacher_actions = teacher_actions[max_length:]
|
||||
|
||||
if len(teacher_actions) < max_length:
|
||||
break
|
||||
|
||||
return states
|
||||
|
||||
def _init_gold_batch(self, examples, max_length):
|
||||
"""Make a square batch, of length equal to the shortest transition
|
||||
|
@@ -736,7 +753,7 @@ def _change_attrs(model, **kwargs):
        model.attrs[key] = value


def states2actions(states: List[StateClass]) -> List[Ints1d]:
def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
    cdef int step
    cdef StateClass state
    cdef StateC* c_state
@@ -757,3 +774,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]:
        actions.append(numpy.array(step_actions, dtype="i"))

    return actions


def _states_diff_to_actions(
    before_states: List[StateClass],
    after_states: List[StateClass]
) -> List[Ints1d]:
    """
    Return for two sets of states the actions to go from the first set of
    states to the second set of states. The histories of the first set of
    states must be a prefix of the second set of states.
    """
    cdef StateClass before_state, after_state
    cdef StateC* c_state_before
    cdef StateC* c_state_after

    assert len(before_states) == len(after_states)

    # Check invariant: before states histories must be prefixes of after states.
    for before_state, after_state in zip(before_states, after_states):
        c_state_before = before_state.c
        c_state_after = after_state.c

        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
                     c_state_after.history.begin())

    actions = []
    while True:
        step = len(actions)

        step_actions = []
        for before_state, after_state in zip(before_states, after_states):
            c_state_before = before_state.c
            c_state_after = after_state.c
            if step < c_state_after.history.size() - c_state_before.history.size():
                step_actions.append(c_state_after.history[c_state_before.history.size() + step])

        # We are done if we have exhausted all histories.
        if len(step_actions) == 0:
            break

        actions.append(numpy.array(step_actions, dtype="i"))

    return actions
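Note: `_states_diff_to_actions` walks the history suffixes column by column: at step k it collects, for every state pair whose "after" history is still longer than "before" plus k, the action at that offset. The same idea in plain Python over lists (purely illustrative, no Cython or StateClass involved):

    # Toy version: histories are plain lists of action ids.
    def states_diff_to_actions(before_histories, after_histories):
        assert len(before_histories) == len(after_histories)
        for b, a in zip(before_histories, after_histories):
            assert a[: len(b)] == b  # "before" must be a prefix of "after"

        actions = []
        while True:
            step = len(actions)
            step_actions = [
                a[len(b) + step]
                for b, a in zip(before_histories, after_histories)
                if step < len(a) - len(b)
            ]
            if not step_actions:
                break
            actions.append(step_actions)
        return actions

    # Example: two states whose new suffixes are [3, 1] and [2].
    print(states_diff_to_actions([[0], [0, 4]], [[0, 3, 1], [0, 4, 2]]))
    # -> [[3, 2], [1]]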
@@ -2,7 +2,7 @@ from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload
from pathlib import Path

class StringStore:
    def __init__(self, strings: Optional[Iterable[str]]) -> None: ...
    def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ...
    @overload
    def __getitem__(self, string_or_hash: str) -> int: ...
    @overload
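Note: the stub change mirrors the runtime behaviour: `StringStore` can now be constructed without arguments. A quick illustrative snippet:

    from spacy.strings import StringStore

    ss_empty = StringStore()                     # no argument needed any more
    ss_filled = StringStore(["rats", "are", "cute"])
    key = ss_filled["rats"]                      # str -> hash
    assert ss_filled[key] == "rats"              # hash -> str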
@ -175,6 +175,18 @@ def test_modify_span_group(doc):
|
|||
assert group[0].label == doc.vocab.strings["TEST"]
|
||||
|
||||
|
||||
def test_char_span_attributes(doc):
|
||||
label = "LABEL"
|
||||
kb_id = "KB_ID"
|
||||
span_id = "SPAN_ID"
|
||||
span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
|
||||
span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
|
||||
assert span1.text == span2.text
|
||||
assert span1.label_ == span2.label_ == label
|
||||
assert span1.kb_id_ == span2.kb_id_ == kb_id
|
||||
assert span1.id_ == span2.id_ == span_id
|
||||
|
||||
|
||||
def test_spans_sent_spans(doc):
|
||||
sents = list(doc.sents)
|
||||
assert sents[0].start == 0
|
||||
|
@ -354,6 +366,14 @@ def test_spans_by_character(doc):
|
|||
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
|
||||
)
|
||||
|
||||
# Span.char_span + alignment mode "contract"
|
||||
span2 = doc[0:2].char_span(
|
||||
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
|
||||
)
|
||||
assert span1.start_char == span2.start_char
|
||||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == "GPE"
|
||||
|
||||
|
||||
def test_span_to_array(doc):
|
||||
span = doc[1:-2]
|
||||
|
|
|
@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
|
|||
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
|
||||
tokens = sv_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.issue(12311)
|
||||
@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
|
||||
def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
|
||||
tokens = sv_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
|
|||
("the", "brown", "$--", 0),
|
||||
("brown", "the", "$--", 1),
|
||||
("brown", "brown", "$--", 0),
|
||||
("over", "jumped", "<+", 0),
|
||||
("quick", "fox", "<+", 0),
|
||||
("the", "quick", "<+", 0),
|
||||
("brown", "fox", "<+", 1),
|
||||
("quick", "fox", "<++", 1),
|
||||
("quick", "over", "<++", 0),
|
||||
("over", "jumped", "<++", 0),
|
||||
("the", "fox", "<++", 2),
|
||||
("brown", "fox", "<-", 0),
|
||||
("fox", "over", "<-", 0),
|
||||
("the", "over", "<-", 0),
|
||||
("over", "jumped", "<-", 1),
|
||||
("brown", "fox", "<--", 0),
|
||||
("fox", "jumped", "<--", 0),
|
||||
("fox", "over", "<--", 1),
|
||||
("fox", "brown", ">+", 0),
|
||||
("over", "fox", ">+", 0),
|
||||
("over", "the", ">+", 0),
|
||||
("jumped", "over", ">+", 1),
|
||||
("jumped", "over", ">++", 1),
|
||||
("fox", "lazy", ">++", 0),
|
||||
("over", "the", ">++", 0),
|
||||
("jumped", "over", ">-", 0),
|
||||
("fox", "quick", ">-", 0),
|
||||
("brown", "quick", ">-", 0),
|
||||
("fox", "brown", ">-", 1),
|
||||
("brown", "fox", ">--", 0),
|
||||
("fox", "brown", ">--", 1),
|
||||
("jumped", "fox", ">--", 1),
|
||||

spacy/tests/parser/test_model.py (new file, 61 lines)

@@ -0,0 +1,61 @@
|
|||
import numpy
|
||||
import pytest
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.ml.tb_framework import TransitionModelInputs
|
||||
from spacy.training import Example
|
||||
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
"They trade mortgage-backed securities.",
|
||||
{
|
||||
"heads": [1, 1, 4, 4, 5, 1, 1],
|
||||
"deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
|
||||
},
|
||||
),
|
||||
(
|
||||
"I like London and Berlin.",
|
||||
{
|
||||
"heads": [1, 1, 1, 2, 2, 1],
|
||||
"deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def nlp_parser():
|
||||
nlp = English()
|
||||
parser = nlp.add_pipe("parser")
|
||||
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for dep in annotations["deps"]:
|
||||
parser.add_label(dep)
|
||||
nlp.initialize()
|
||||
|
||||
return nlp, parser
|
||||
|
||||
|
||||
def test_incorrect_number_of_actions(nlp_parser):
|
||||
nlp, parser = nlp_parser
|
||||
doc = nlp.make_doc("test")
|
||||
|
||||
# Too many actions for the number of docs
|
||||
with pytest.raises(AssertionError):
|
||||
parser.model.predict(
|
||||
TransitionModelInputs(
|
||||
docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
|
||||
)
|
||||
)
|
||||
|
||||
# Too few actions for the number of docs
|
||||
with pytest.raises(AssertionError):
|
||||
parser.model.predict(
|
||||
TransitionModelInputs(
|
||||
docs=[doc, doc],
|
||||
moves=parser.moves,
|
||||
actions=[numpy.array([0], dtype="i")],
|
||||
)
|
||||
)
|
|
@ -623,7 +623,9 @@ def test_is_distillable():
|
|||
assert ner.is_distillable
|
||||
|
||||
|
||||
def test_distill():
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
|
||||
def test_distill(max_moves):
|
||||
teacher = English()
|
||||
teacher_ner = teacher.add_pipe("ner")
|
||||
train_examples = []
|
||||
|
@ -641,6 +643,7 @@ def test_distill():
|
|||
|
||||
student = English()
|
||||
student_ner = student.add_pipe("ner")
|
||||
student_ner.cfg["update_with_oracle_cut_size"] = max_moves
|
||||
student_ner.initialize(
|
||||
get_examples=lambda: train_examples, labels=teacher_ner.label_data
|
||||
)
|
||||
|
|
|
@ -463,7 +463,9 @@ def test_is_distillable():
|
|||
assert parser.is_distillable
|
||||
|
||||
|
||||
def test_distill():
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
|
||||
def test_distill(max_moves):
|
||||
teacher = English()
|
||||
teacher_parser = teacher.add_pipe("parser")
|
||||
train_examples = []
|
||||
|
@ -481,6 +483,7 @@ def test_distill():
|
|||
|
||||
student = English()
|
||||
student_parser = student.add_pipe("parser")
|
||||
student_parser.cfg["update_with_oracle_cut_size"] = max_moves
|
||||
student_parser.initialize(
|
||||
get_examples=lambda: train_examples, labels=teacher_parser.label_data
|
||||
)
|
||||
|
|
|
@ -54,9 +54,11 @@ def test_annotates_on_update():
|
|||
return AssertSents(name)
|
||||
|
||||
class AssertSents:
|
||||
model = None
|
||||
is_trainable = True
|
||||
|
||||
def __init__(self, name, **cfg):
|
||||
self.name = name
|
||||
pass
|
||||
|
||||
def __call__(self, doc):
|
||||
if not doc.has_annotation("SENT_START"):
|
||||
|
@ -64,10 +66,16 @@ def test_annotates_on_update():
|
|||
return doc
|
||||
|
||||
def update(self, examples, *, drop=0.0, sgd=None, losses=None):
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
||||
for example in examples:
|
||||
if not example.predicted.has_annotation("SENT_START"):
|
||||
raise ValueError("No sents")
|
||||
return {}
|
||||
|
||||
return losses
|
||||
|
||||
def finish_update(self, sgd=None):
|
||||
pass
|
||||
|
||||
nlp = English()
|
||||
nlp.add_pipe("sentencizer")
|
||||
|
|
|
@ -7,10 +7,10 @@ from thinc.types import Ragged
|
|||
from spacy import registry, util
|
||||
from spacy.attrs import ENT_KB_ID
|
||||
from spacy.compat import pickle
|
||||
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
|
||||
from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase
|
||||
from spacy.lang.en import English
|
||||
from spacy.ml import load_kb
|
||||
from spacy.ml.models.entity_linker import build_span_maker
|
||||
from spacy.ml.models.entity_linker import build_span_maker, get_candidates
|
||||
from spacy.pipeline import EntityLinker, TrainablePipe
|
||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
from spacy.scorer import Scorer
|
||||
|
@ -353,6 +353,9 @@ def test_kb_default(nlp):
|
|||
"""Test that the default (empty) KB is loaded upon construction"""
|
||||
entity_linker = nlp.add_pipe("entity_linker", config={})
|
||||
assert len(entity_linker.kb) == 0
|
||||
with pytest.raises(ValueError, match="E139"):
|
||||
# this raises an error because the KB is empty
|
||||
entity_linker.validate_kb()
|
||||
assert entity_linker.kb.get_size_entities() == 0
|
||||
assert entity_linker.kb.get_size_aliases() == 0
|
||||
# 64 is the default value from pipeline.entity_linker
|
||||
|
@@ -462,16 +465,17 @@ def test_candidate_generation(nlp):
    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # test the size of the relevant candidates
    adam_ent_cands = get_candidates(mykb, adam_ent)
    assert len(get_candidates(mykb, douglas_ent)) == 2
    assert len(get_candidates(mykb, adam_ent)) == 1
    assert len(adam_ent_cands) == 1
    assert len(get_candidates(mykb, Adam_ent)) == 0  # default case sensitive
    assert len(get_candidates(mykb, shrubbery_ent)) == 0

    # test the content of the candidates
    assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
    assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
    assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
    assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
    assert adam_ent_cands[0].entity_id_ == "Q2"
    assert adam_ent_cands[0].alias == "adam"
    assert_almost_equal(adam_ent_cands[0].entity_freq, 12)
    assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9)


def test_el_pipe_configuration(nlp):
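Note: the test reflects the reworked `Candidate` surface: the hash-based `entity_`/`alias_` pair gives way to `entity_id_`, and `alias` now returns the string directly. A small sketch of reading candidates under the new names (mirrors the calls used in these tests, including the now-private `_get_alias_candidates` helper):

    from spacy.kb import InMemoryLookupKB
    from spacy.vocab import Vocab

    kb = InMemoryLookupKB(Vocab(), entity_vector_length=1)
    kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    kb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    cand = kb._get_alias_candidates("adam")[0]
    print(cand.entity_id_, cand.alias, cand.prior_prob)  # "Q2" "adam" 0.9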
@ -499,7 +503,7 @@ def test_el_pipe_configuration(nlp):
|
|||
assert doc[2].ent_kb_id_ == "Q2"
|
||||
|
||||
def get_lowercased_candidates(kb, span):
|
||||
return kb.get_alias_candidates(span.text.lower())
|
||||
return kb._get_alias_candidates(span.text.lower())
|
||||
|
||||
def get_lowercased_candidates_batch(kb, spans):
|
||||
return [get_lowercased_candidates(kb, span) for span in spans]
|
||||
|
@ -558,24 +562,22 @@ def test_vocab_serialization(nlp):
|
|||
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
|
||||
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||
|
||||
candidates = mykb.get_alias_candidates("adam")
|
||||
candidates = mykb._get_alias_candidates("adam")
|
||||
assert len(candidates) == 1
|
||||
assert candidates[0].entity == q2_hash
|
||||
assert candidates[0].entity_ == "Q2"
|
||||
assert candidates[0].alias == adam_hash
|
||||
assert candidates[0].alias_ == "adam"
|
||||
assert candidates[0].entity_id == q2_hash
|
||||
assert candidates[0].entity_id_ == "Q2"
|
||||
assert candidates[0].alias == "adam"
|
||||
|
||||
with make_tempdir() as d:
|
||||
mykb.to_disk(d / "kb")
|
||||
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
|
||||
kb_new_vocab.from_disk(d / "kb")
|
||||
|
||||
candidates = kb_new_vocab.get_alias_candidates("adam")
|
||||
candidates = kb_new_vocab._get_alias_candidates("adam")
|
||||
assert len(candidates) == 1
|
||||
assert candidates[0].entity == q2_hash
|
||||
assert candidates[0].entity_ == "Q2"
|
||||
assert candidates[0].alias == adam_hash
|
||||
assert candidates[0].alias_ == "adam"
|
||||
assert candidates[0].entity_id == q2_hash
|
||||
assert candidates[0].entity_id_ == "Q2"
|
||||
assert candidates[0].alias == "adam"
|
||||
|
||||
assert kb_new_vocab.get_vector("Q2") == [2]
|
||||
assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
|
||||
|
@ -595,20 +597,20 @@ def test_append_alias(nlp):
|
|||
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
|
||||
|
||||
# test the size of the relevant candidates
|
||||
assert len(mykb.get_alias_candidates("douglas")) == 2
|
||||
assert len(mykb._get_alias_candidates("douglas")) == 2
|
||||
|
||||
# append an alias
|
||||
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
|
||||
|
||||
# test the size of the relevant candidates has been incremented
|
||||
assert len(mykb.get_alias_candidates("douglas")) == 3
|
||||
assert len(mykb._get_alias_candidates("douglas")) == 3
|
||||
|
||||
# append the same alias-entity pair again should not work (will throw a warning)
|
||||
with pytest.warns(UserWarning):
|
||||
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
|
||||
|
||||
# test the size of the relevant candidates remained unchanged
|
||||
assert len(mykb.get_alias_candidates("douglas")) == 3
|
||||
assert len(mykb._get_alias_candidates("douglas")) == 3
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:\\[W036")
|
||||
|
@ -905,11 +907,11 @@ def test_kb_to_bytes():
|
|||
assert kb_2.contains_alias("Russ Cochran")
|
||||
assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
|
||||
assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
|
||||
assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
|
||||
kb_2.get_alias_candidates("Russ Cochran")
|
||||
assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
|
||||
kb_2._get_alias_candidates("Russ Cochran")
|
||||
)
|
||||
assert len(kb_1.get_alias_candidates("Randomness")) == len(
|
||||
kb_2.get_alias_candidates("Randomness")
|
||||
assert len(kb_1._get_alias_candidates("Randomness")) == len(
|
||||
kb_2._get_alias_candidates("Randomness")
|
||||
)
|
||||
|
||||
|
||||
|
@ -990,14 +992,11 @@ def test_scorer_links():
|
|||
@pytest.mark.parametrize(
|
||||
"name,config",
|
||||
[
|
||||
("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
|
||||
("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
|
||||
],
|
||||
)
|
||||
# fmt: on
|
||||
def test_legacy_architectures(name, config):
|
||||
from spacy_legacy.components.entity_linker import EntityLinker_v1
|
||||
|
||||
# Ensure that the legacy architectures still work
|
||||
vector_length = 3
|
||||
nlp = English()
|
||||
|
@ -1019,10 +1018,7 @@ def test_legacy_architectures(name, config):
|
|||
return mykb
|
||||
|
||||
entity_linker = nlp.add_pipe(name, config={"model": config})
|
||||
if config["@architectures"] == "spacy.EntityLinker.v1":
|
||||
assert isinstance(entity_linker, EntityLinker_v1)
|
||||
else:
|
||||
assert isinstance(entity_linker, EntityLinker)
|
||||
assert isinstance(entity_linker, EntityLinker)
|
||||
entity_linker.set_kb(create_kb)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
|
|
|
@@ -9,6 +9,7 @@ from spacy.lang.en import English
from spacy.lang.en.syntax_iterators import noun_chunks
from spacy.language import Language
from spacy.pipeline import TrainablePipe
from spacy.strings import StringStore
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir
@@ -131,7 +132,7 @@ def test_issue5458():
    # Test that the noun chunker does not generate overlapping spans
    # fmt: off
    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
    vocab = Vocab(strings=words)
    vocab = Vocab(strings=StringStore(words))
    deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
    pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
    heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
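Note: this matches the stricter constructor used throughout these test updates: `Vocab(strings=...)` now expects a `StringStore` rather than a bare list of strings. Illustrative sketch:

    from spacy.strings import StringStore
    from spacy.vocab import Vocab

    words = ["rats", "are", "cute"]
    vocab = Vocab(strings=StringStore(words))   # v4 style
    assert "rats" in vocab.strings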
@ -540,3 +540,86 @@ def test_tok2vec_listeners_textcat():
|
|||
assert cats1["imperative"] < 0.9
|
||||
assert [t.tag_ for t in docs[0]] == ["V", "J", "N"]
|
||||
assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
|
||||
|
||||
|
||||
cfg_string_distillation = """
|
||||
[nlp]
|
||||
lang = "en"
|
||||
pipeline = ["tok2vec","tagger"]
|
||||
|
||||
[components]
|
||||
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
nO = null
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[components.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v2"
|
||||
|
||||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v2"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
rows = [2000, 1000, 1000, 1000]
|
||||
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||
include_static_vectors = false
|
||||
|
||||
[components.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||
width = 96
|
||||
depth = 4
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
"""
|
||||
|
||||
|
||||
def test_tok2vec_distillation_teacher_annotations():
|
||||
orig_config = Config().from_str(cfg_string_distillation)
|
||||
teacher_nlp = util.load_model_from_config(
|
||||
orig_config, auto_fill=True, validate=True
|
||||
)
|
||||
student_nlp = util.load_model_from_config(
|
||||
orig_config, auto_fill=True, validate=True
|
||||
)
|
||||
|
||||
train_examples_teacher = []
|
||||
train_examples_student = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples_teacher.append(
|
||||
Example.from_dict(teacher_nlp.make_doc(t[0]), t[1])
|
||||
)
|
||||
train_examples_student.append(
|
||||
Example.from_dict(student_nlp.make_doc(t[0]), t[1])
|
||||
)
|
||||
|
||||
optimizer = teacher_nlp.initialize(lambda: train_examples_teacher)
|
||||
student_nlp.initialize(lambda: train_examples_student)
|
||||
|
||||
# Since Language.distill creates a copy of the examples to use as
|
||||
# its internal teacher/student docs, we'll need to monkey-patch the
|
||||
# tok2vec pipe's distill method.
|
||||
student_tok2vec = student_nlp.get_pipe("tok2vec")
|
||||
student_tok2vec._old_distill = student_tok2vec.distill
|
||||
|
||||
def tok2vec_distill_wrapper(
|
||||
self,
|
||||
teacher_pipe,
|
||||
examples,
|
||||
**kwargs,
|
||||
):
|
||||
assert all(not eg.reference.tensor.any() for eg in examples)
|
||||
out = self._old_distill(teacher_pipe, examples, **kwargs)
|
||||
assert all(eg.reference.tensor.any() for eg in examples)
|
||||
return out
|
||||
|
||||
student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
|
||||
student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
from typing import Callable
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, Any, Dict
|
||||
|
||||
from spacy import util
|
||||
from spacy.util import ensure_path, registry, load_model_from_config
|
||||
import srsly
|
||||
|
||||
from spacy import util, Errors
|
||||
from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
|
||||
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import Config
|
||||
|
@ -63,19 +66,21 @@ def _check_kb(kb):
|
|||
assert alias_string not in kb.get_alias_strings()
|
||||
|
||||
# check candidates & probabilities
|
||||
candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
|
||||
candidates = sorted(
|
||||
kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_
|
||||
)
|
||||
assert len(candidates) == 2
|
||||
|
||||
assert candidates[0].entity_ == "Q007"
|
||||
assert candidates[0].entity_id_ == "Q007"
|
||||
assert 6.999 < candidates[0].entity_freq < 7.01
|
||||
assert candidates[0].entity_vector == [0, 0, 7]
|
||||
assert candidates[0].alias_ == "double07"
|
||||
assert candidates[0].alias == "double07"
|
||||
assert 0.899 < candidates[0].prior_prob < 0.901
|
||||
|
||||
assert candidates[1].entity_ == "Q17"
|
||||
assert candidates[1].entity_id_ == "Q17"
|
||||
assert 1.99 < candidates[1].entity_freq < 2.01
|
||||
assert candidates[1].entity_vector == [7, 1, 0]
|
||||
assert candidates[1].alias_ == "double07"
|
||||
assert candidates[1].alias == "double07"
|
||||
assert 0.099 < candidates[1].prior_prob < 0.101
|
||||
|
||||
|
||||
|
@ -91,7 +96,10 @@ def test_serialize_subclassed_kb():
|
|||
|
||||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
|
||||
|
||||
[components.entity_linker.generate_empty_kb]
|
||||
@misc = "kb_test.CustomEmptyKB.v1"
|
||||
|
||||
[initialize]
|
||||
|
||||
[initialize.components]
|
||||
|
@ -99,7 +107,7 @@ def test_serialize_subclassed_kb():
|
|||
[initialize.components.entity_linker]
|
||||
|
||||
[initialize.components.entity_linker.kb_loader]
|
||||
@misc = "spacy.CustomKB.v1"
|
||||
@misc = "kb_test.CustomKB.v1"
|
||||
entity_vector_length = 342
|
||||
custom_field = 666
|
||||
"""
|
||||
|
@ -109,10 +117,57 @@ def test_serialize_subclassed_kb():
|
|||
super().__init__(vocab, entity_vector_length)
|
||||
self.custom_field = custom_field
|
||||
|
||||
@registry.misc("spacy.CustomKB.v1")
|
||||
def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
|
||||
"""We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
|
||||
path = ensure_path(path)
|
||||
if not path.exists():
|
||||
path.mkdir(parents=True)
|
||||
if not path.is_dir():
|
||||
raise ValueError(Errors.E928.format(loc=path))
|
||||
|
||||
def serialize_custom_fields(file_path: Path) -> None:
|
||||
srsly.write_json(file_path, {"custom_field": self.custom_field})
|
||||
|
||||
serialize = {
|
||||
"contents": lambda p: self.write_contents(p),
|
||||
"strings.json": lambda p: self.vocab.strings.to_disk(p),
|
||||
"custom_fields": lambda p: serialize_custom_fields(p),
|
||||
}
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
|
||||
"""We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
|
||||
path = ensure_path(path)
|
||||
if not path.exists():
|
||||
raise ValueError(Errors.E929.format(loc=path))
|
||||
if not path.is_dir():
|
||||
raise ValueError(Errors.E928.format(loc=path))
|
||||
|
||||
def deserialize_custom_fields(file_path: Path) -> None:
|
||||
self.custom_field = srsly.read_json(file_path)["custom_field"]
|
||||
|
||||
deserialize: Dict[str, Callable[[Any], Any]] = {
|
||||
"contents": lambda p: self.read_contents(p),
|
||||
"strings.json": lambda p: self.vocab.strings.from_disk(p),
|
||||
"custom_fields": lambda p: deserialize_custom_fields(p),
|
||||
}
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
|
||||
@registry.misc("kb_test.CustomEmptyKB.v1")
|
||||
def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
|
||||
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
|
||||
return SubInMemoryLookupKB(
|
||||
vocab=vocab,
|
||||
entity_vector_length=entity_vector_length,
|
||||
custom_field=0,
|
||||
)
|
||||
|
||||
return empty_kb_factory
|
||||
|
||||
@registry.misc("kb_test.CustomKB.v1")
|
||||
def custom_kb(
|
||||
entity_vector_length: int, custom_field: int
|
||||
) -> Callable[[Vocab], InMemoryLookupKB]:
|
||||
) -> Callable[[Vocab], SubInMemoryLookupKB]:
|
||||
def custom_kb_factory(vocab):
|
||||
kb = SubInMemoryLookupKB(
|
||||
vocab=vocab,
|
||||
|
@ -139,6 +194,6 @@ def test_serialize_subclassed_kb():
|
|||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
entity_linker2 = nlp2.get_pipe("entity_linker")
|
||||
# After IO, the KB is the standard one
|
||||
assert type(entity_linker2.kb) == InMemoryLookupKB
|
||||
assert type(entity_linker2.kb) == SubInMemoryLookupKB
|
||||
assert entity_linker2.kb.entity_vector_length == 342
|
||||
assert not hasattr(entity_linker2.kb, "custom_field")
|
||||
assert entity_linker2.kb.custom_field == 666
|
||||
|
|
|
@ -181,7 +181,7 @@ def test_issue4042_bug2():
|
|||
@pytest.mark.issue(4725)
|
||||
def test_issue4725_1():
|
||||
"""Ensure the pickling of the NER goes well"""
|
||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||
vocab = Vocab()
|
||||
nlp = English(vocab=vocab)
|
||||
config = {
|
||||
"update_with_oracle_cut_size": 111,
|
||||
|
|
|
@ -15,8 +15,11 @@ from spacy.lang.lex_attrs import norm
|
|||
|
||||
from ..util import make_tempdir
|
||||
|
||||
test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
|
||||
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
|
||||
test_strings = [
|
||||
(StringStore(), StringStore()),
|
||||
(StringStore(["rats", "are", "cute"]), StringStore(["i", "like", "rats"])),
|
||||
]
|
||||
test_strings_attrs = [(StringStore(["rats", "are", "cute"]), "Hello")]
|
||||
|
||||
|
||||
@pytest.mark.issue(599)
|
||||
|
@ -84,7 +87,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
|
|||
vocab2 = Vocab(strings=strings2)
|
||||
vocab1_b = vocab1.to_bytes()
|
||||
vocab2_b = vocab2.to_bytes()
|
||||
if strings1 == strings2:
|
||||
if strings1.to_bytes() == strings2.to_bytes():
|
||||
assert vocab1_b == vocab2_b
|
||||
else:
|
||||
assert vocab1_b != vocab2_b
|
||||
|
@ -121,11 +124,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
|
|||
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
|
||||
vocab1 = Vocab(strings=strings, lex_attr_getters={NORM: norm})
|
||||
vocab2 = Vocab(lex_attr_getters={NORM: norm})
|
||||
vocab1[strings[0]].norm_ = lex_attr
|
||||
assert vocab1[strings[0]].norm_ == lex_attr
|
||||
assert vocab2[strings[0]].norm_ != lex_attr
|
||||
s = next(iter(vocab1.strings))
|
||||
vocab1[s].norm_ = lex_attr
|
||||
assert vocab1[s].norm_ == lex_attr
|
||||
assert vocab2[s].norm_ != lex_attr
|
||||
vocab2 = vocab2.from_bytes(vocab1.to_bytes())
|
||||
assert vocab2[strings[0]].norm_ == lex_attr
|
||||
assert vocab2[s].norm_ == lex_attr
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
|
||||
|
@ -140,14 +144,15 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr):
|
|||
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
|
||||
vocab1 = Vocab(strings=strings, lex_attr_getters={NORM: norm})
|
||||
vocab2 = Vocab(lex_attr_getters={NORM: norm})
|
||||
vocab1[strings[0]].norm_ = lex_attr
|
||||
assert vocab1[strings[0]].norm_ == lex_attr
|
||||
assert vocab2[strings[0]].norm_ != lex_attr
|
||||
s = next(iter(vocab1.strings))
|
||||
vocab1[s].norm_ = lex_attr
|
||||
assert vocab1[s].norm_ == lex_attr
|
||||
assert vocab2[s].norm_ != lex_attr
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "vocab"
|
||||
vocab1.to_disk(file_path)
|
||||
vocab2 = vocab2.from_disk(file_path)
|
||||
assert vocab2[strings[0]].norm_ == lex_attr
|
||||
assert vocab2[s].norm_ == lex_attr
|
||||
|
||||
|
||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||
|
|
|
@ -2,7 +2,6 @@ import os
|
|||
import math
|
||||
from collections import Counter
|
||||
from typing import Tuple, List, Dict, Any
|
||||
import pkg_resources
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -1017,8 +1016,6 @@ def test_local_remote_storage_pull_missing():
|
|||
|
||||
|
||||
def test_cli_find_threshold(capsys):
|
||||
thresholds = numpy.linspace(0, 1, 10)
|
||||
|
||||
def make_examples(nlp: Language) -> List[Example]:
|
||||
docs: List[Example] = []
|
||||
|
||||
|
@ -1082,8 +1079,6 @@ def test_cli_find_threshold(capsys):
|
|||
scores_key="cats_macro_f",
|
||||
silent=True,
|
||||
)
|
||||
assert best_threshold != thresholds[0]
|
||||
assert thresholds[0] < best_threshold < thresholds[9]
|
||||
assert best_score == max(res.values())
|
||||
assert res[1.0] == 0.0
|
||||
|
||||
|
@ -1091,7 +1086,7 @@ def test_cli_find_threshold(capsys):
|
|||
nlp, _ = init_nlp((("spancat", {}),))
|
||||
with make_tempdir() as nlp_dir:
|
||||
nlp.to_disk(nlp_dir)
|
||||
res = find_threshold(
|
||||
best_threshold, best_score, res = find_threshold(
|
||||
model=nlp_dir,
|
||||
data_path=docs_dir / "docs.spacy",
|
||||
pipe_name="spancat",
|
||||
|
@ -1099,10 +1094,8 @@ def test_cli_find_threshold(capsys):
|
|||
scores_key="spans_sc_f",
|
||||
silent=True,
|
||||
)
|
||||
assert res[0] != thresholds[0]
|
||||
assert thresholds[0] < res[0] < thresholds[8]
|
||||
assert res[1] >= 0.6
|
||||
assert res[2][1.0] == 0.0
|
||||
assert best_score == max(res.values())
|
||||
assert res[1.0] == 0.0
|
||||
|
||||
# Having multiple textcat_multilabel components should work, since the name has to be specified.
|
||||
nlp, _ = init_nlp((("textcat_multilabel", {}),))
|
||||
|
@ -1132,6 +1125,7 @@ def test_cli_find_threshold(capsys):
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||
@pytest.mark.parametrize(
|
||||
"reqs,output",
|
||||
[
|
||||
|
@ -1164,6 +1158,8 @@ def test_cli_find_threshold(capsys):
|
|||
],
|
||||
)
|
||||
def test_project_check_requirements(reqs, output):
|
||||
import pkg_resources
|
||||
|
||||
# excessive guard against unlikely package name
|
||||
try:
|
||||
pkg_resources.require("spacyunknowndoesnotexist12345")
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
import srsly
|
||||
from typer.testing import CliRunner
|
||||
from spacy.tokens import DocBin, Doc
|
||||
|
||||
from spacy.cli._util import app
|
||||
from .util import make_tempdir
|
||||
from .util import make_tempdir, normalize_whitespace
|
||||
|
||||
|
||||
def test_convert_auto():
|
||||
|
@ -38,8 +40,8 @@ def test_benchmark_accuracy_alias():
|
|||
# Verify that the `evaluate` alias works correctly.
|
||||
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
|
||||
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
|
||||
assert result_benchmark.stdout == result_evaluate.stdout.replace(
|
||||
"spacy evaluate", "spacy benchmark accuracy"
|
||||
assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
|
||||
result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
|
||||
)
|
||||
|
||||
|
||||
|
@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
|
|||
# Instead of checking specific wording of the output, which may change,
|
||||
# we'll check that this section of the debug output is present.
|
||||
assert "= Trainable Lemmatizer =" in result_debug_data.stdout
|
||||
|
||||
|
||||
# project tests
|
||||
|
||||
SAMPLE_PROJECT = {
|
||||
"title": "Sample project",
|
||||
"description": "This is a project for testing",
|
||||
"assets": [
|
||||
{
|
||||
"dest": "assets/spacy-readme.md",
|
||||
"url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
|
||||
"checksum": "411b2c89ccf34288fae8ed126bf652f7",
|
||||
},
|
||||
{
|
||||
"dest": "assets/citation.cff",
|
||||
"url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
|
||||
"checksum": "c996bfd80202d480eb2e592369714e5e",
|
||||
"extra": True,
|
||||
},
|
||||
],
|
||||
"commands": [
|
||||
{
|
||||
"name": "ok",
|
||||
"help": "print ok",
|
||||
"script": ["python -c \"print('okokok')\""],
|
||||
},
|
||||
{
|
||||
"name": "create",
|
||||
"help": "make a file",
|
||||
"script": ["touch abc.txt"],
|
||||
"outputs": ["abc.txt"],
|
||||
},
|
||||
{
|
||||
"name": "clean",
|
||||
"help": "remove test file",
|
||||
"script": ["rm abc.txt"],
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def project_dir():
|
||||
with make_tempdir() as pdir:
|
||||
(pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
|
||||
yield pdir
|
||||
|
||||
|
||||
def test_project_document(project_dir):
|
||||
readme_path = project_dir / "README.md"
|
||||
assert not readme_path.exists(), "README already exists"
|
||||
result = CliRunner().invoke(
|
||||
app, ["project", "document", str(project_dir), "-o", str(readme_path)]
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert readme_path.is_file()
|
||||
text = readme_path.read_text("utf-8")
|
||||
assert SAMPLE_PROJECT["description"] in text
|
||||
|
||||
|
||||
def test_project_assets(project_dir):
|
||||
asset_dir = project_dir / "assets"
|
||||
assert not asset_dir.exists(), "Assets dir is already present"
|
||||
result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
|
||||
# check that extras work
|
||||
result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
|
||||
|
||||
|
||||
def test_project_run(project_dir):
|
||||
# make sure dry run works
|
||||
test_file = project_dir / "abc.txt"
|
||||
result = CliRunner().invoke(
|
||||
app, ["project", "run", "--dry", "create", str(project_dir)]
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert not test_file.is_file()
|
||||
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert test_file.is_file()
|
||||
result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert "okokok" in result.stdout
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"options",
|
||||
[
|
||||
"",
|
||||
# "--sparse",
|
||||
"--branch v3",
|
||||
"--repo https://github.com/explosion/projects --branch v3",
|
||||
],
|
||||
)
|
||||
def test_project_clone(options):
|
||||
with make_tempdir() as workspace:
|
||||
out = workspace / "project"
|
||||
target = "benchmarks/ner_conll03"
|
||||
if not options:
|
||||
options = []
|
||||
else:
|
||||
options = options.split()
|
||||
result = CliRunner().invoke(
|
||||
app, ["project", "clone", target, *options, str(out)]
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert (out / "README.md").is_file()
|
||||
|
||||
|
||||
def test_project_push_pull(project_dir):
|
||||
proj = dict(SAMPLE_PROJECT)
|
||||
remote = "xyz"
|
||||
|
||||
with make_tempdir() as remote_dir:
|
||||
proj["remotes"] = {remote: str(remote_dir)}
|
||||
proj_text = srsly.yaml_dumps(proj)
|
||||
(project_dir / "project.yml").write_text(proj_text)
|
||||
|
||||
test_file = project_dir / "abc.txt"
|
||||
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert test_file.is_file()
|
||||
result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert not test_file.exists()
|
||||
result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert test_file.is_file()
|
||||
|
|
|
@ -10,8 +10,9 @@ from spacy.training import Example
|
|||
from spacy.lang.en import English
|
||||
from spacy.lang.de import German
|
||||
from spacy.util import registry, ignore_error, raise_error, find_matching_language
|
||||
from spacy.util import load_model_from_config
|
||||
import spacy
|
||||
from thinc.api import CupyOps, NumpyOps, get_current_ops
|
||||
from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
|
||||
|
||||
from .util import add_vecs_to_vocab, assert_docs_equal
|
||||
|
||||
|
@ -25,6 +26,51 @@ try:
|
|||
except ImportError:
|
||||
pass
|
||||
|
||||
TAGGER_CFG_STRING = """
|
||||
[nlp]
|
||||
lang = "en"
|
||||
pipeline = ["tok2vec","tagger"]
|
||||
|
||||
[components]
|
||||
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
nO = null
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[components.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v2"
|
||||
|
||||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
rows = [2000, 1000, 1000, 1000]
|
||||
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||
include_static_vectors = false
|
||||
|
||||
[components.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||
width = 96
|
||||
depth = 4
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
"""
|
||||
|
||||
|
||||
TAGGER_TRAIN_DATA = [
|
||||
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
|
||||
("Eat blue ham", {"tags": ["V", "J", "N"]}),
|
||||
]
|
||||
|
||||
|
||||
TAGGER_TRAIN_DATA = [
|
||||
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
|
||||
|
@ -52,7 +98,7 @@ def assert_sents_error(doc):
|
|||
|
||||
def warn_error(proc_name, proc, docs, e):
|
||||
logger = logging.getLogger("spacy")
|
||||
logger.warning(f"Trouble with component {proc_name}.")
|
||||
logger.warning("Trouble with component %s.", proc_name)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -91,6 +137,44 @@ def test_language_update(nlp):
|
|||
example = Example.from_dict(doc, wrongkeyannots)
|
||||
|
||||
|
||||
def test_language_update_updates():
|
||||
config = Config().from_str(TAGGER_CFG_STRING)
|
||||
nlp = load_model_from_config(config, auto_fill=True, validate=True)
|
||||
|
||||
train_examples = []
|
||||
for t in TAGGER_TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
|
||||
nlp.update(train_examples, sgd=optimizer)
|
||||
docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
|
||||
|
||||
xp = get_array_module(docs_after_update[0].tensor)
|
||||
assert xp.any(
|
||||
xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
|
||||
)
|
||||
|
||||
|
||||
def test_language_update_does_not_update_with_sgd_false():
|
||||
config = Config().from_str(TAGGER_CFG_STRING)
|
||||
nlp = load_model_from_config(config, auto_fill=True, validate=True)
|
||||
|
||||
train_examples = []
|
||||
for t in TAGGER_TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
|
||||
nlp.update(train_examples, sgd=False)
|
||||
docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
|
||||
|
||||
xp = get_array_module(docs_after_update[0].tensor)
|
||||
xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
|
||||
|
||||
|
||||
def test_language_evaluate(nlp):
|
||||
text = "hello world"
|
||||
annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import numpy
|
||||
import tempfile
|
||||
import contextlib
|
||||
import re
|
||||
import srsly
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
|
|||
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
|
||||
assert k1 == k2
|
||||
assert v1 == v2
|
||||
|
||||
|
||||
def normalize_whitespace(s):
|
||||
return re.sub(r"\s+", " ", s)
|
||||
|
|
|
@ -17,7 +17,7 @@ def test_issue361(en_vocab, text1, text2):
|
|||
|
||||
@pytest.mark.issue(600)
|
||||
def test_issue600():
|
||||
vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
|
||||
vocab = Vocab()
|
||||
doc = Doc(vocab, words=["hello"])
|
||||
doc[0].tag_ = "NN"
|
||||
|
||||
|
|
|
@ -84,7 +84,7 @@ def test_issue1539():
|
|||
@pytest.mark.issue(1807)
|
||||
def test_issue1807():
|
||||
"""Test vocab.set_vector also adds the word to the vocab."""
|
||||
vocab = Vocab(vectors_name="test_issue1807")
|
||||
vocab = Vocab()
|
||||
assert "hello" not in vocab
|
||||
vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
|
||||
assert "hello" in vocab
|
||||
|
@ -94,13 +94,12 @@ def test_issue1807():
|
|||
def test_issue2871():
|
||||
"""Test that vectors recover the correct key for spaCy reserved words."""
|
||||
words = ["dog", "cat", "SUFFIX"]
|
||||
vocab = Vocab(vectors_name="test_issue2871")
|
||||
vocab = Vocab()
|
||||
vocab.vectors.resize(shape=(3, 10))
|
||||
vector_data = numpy.zeros((3, 10), dtype="f")
|
||||
for word in words:
|
||||
_ = vocab[word] # noqa: F841
|
||||
vocab.set_vector(word, vector_data[0])
|
||||
vocab.vectors.name = "dummy_vectors"
|
||||
assert vocab["dog"].rank == 0
|
||||
assert vocab["cat"].rank == 1
|
||||
assert vocab["SUFFIX"].rank == 2
|
||||
|
@ -125,7 +124,7 @@ def test_issue4725_2():
|
|||
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
|
||||
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
|
||||
# or because of issues with pickling the NER (cf test_issue4725_1)
|
||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||
vocab = Vocab()
|
||||
data = numpy.ndarray((5, 3), dtype="f")
|
||||
data[0] = 1.0
|
||||
data[1] = 2.0
|
||||
|
@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
|
|||
|
||||
|
||||
def test_vocab_add_vector():
|
||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||
vocab = Vocab()
|
||||
data = OPS.xp.ndarray((5, 3), dtype="f")
|
||||
data[0] = 1.0
|
||||
data[1] = 2.0
|
||||
|
@ -356,7 +355,7 @@ def test_vocab_add_vector():
|
|||
|
||||
|
||||
def test_vocab_prune_vectors():
|
||||
vocab = Vocab(vectors_name="test_vocab_prune_vectors")
|
||||
vocab = Vocab()
|
||||
_ = vocab["cat"] # noqa: F841
|
||||
_ = vocab["dog"] # noqa: F841
|
||||
_ = vocab["kitten"] # noqa: F841
|
||||
|
@ -405,7 +404,7 @@ def test_vectors_serialize():
|
|||
|
||||
|
||||
def test_vector_is_oov():
|
||||
vocab = Vocab(vectors_name="test_vocab_is_oov")
|
||||
vocab = Vocab()
|
||||
data = OPS.xp.ndarray((5, 3), dtype="f")
|
||||
data[0] = 1.0
|
||||
data[1] = 2.0
|
||||
|
|
|
@ -105,9 +105,11 @@ class Doc:
|
|||
start_idx: int,
|
||||
end_idx: int,
|
||||
label: Union[int, str] = ...,
|
||||
*,
|
||||
kb_id: Union[int, str] = ...,
|
||||
vector: Optional[Floats1d] = ...,
|
||||
alignment_mode: str = ...,
|
||||
span_id: Union[int, str] = ...,
|
||||
) -> Span: ...
|
||||
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
|
||||
@property
|
||||
|
@ -126,12 +128,12 @@ class Doc:
|
|||
blocked: Optional[List[Span]] = ...,
|
||||
missing: Optional[List[Span]] = ...,
|
||||
outside: Optional[List[Span]] = ...,
|
||||
default: str = ...
|
||||
default: str = ...,
|
||||
) -> None: ...
|
||||
@property
|
||||
def noun_chunks(self) -> Iterator[Span]: ...
|
||||
def noun_chunks(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def sents(self) -> Iterator[Span]: ...
|
||||
def sents(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def lang(self) -> int: ...
|
||||
@property
|
||||
|
|
|
@@ -520,7 +520,7 @@ cdef class Doc:
    def doc(self):
        return self

    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
    def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
        """Create a `Span` object from the slice
        `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
        created.
@@ -528,9 +528,9 @@ cdef class Doc:
        doc (Doc): The parent document.
        start_idx (int): The index of the first character of the span.
        end_idx (int): The index of the first character after the span.
        label (uint64 or string): A label to attach to the Span, e.g. for
        label (Union[int, str]): A label to attach to the Span, e.g. for
            named entities.
        kb_id (uint64 or string): An ID from a KB to capture the meaning of a
        kb_id (Union[int, str]): An ID from a KB to capture the meaning of a
            named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
@@ -539,6 +539,7 @@ cdef class Doc:
            with token boundaries), "contract" (span of all tokens completely
            within the character span), "expand" (span of all tokens at least
            partially covered by the character span). Defaults to "strict".
        span_id (Union[int, str]): An identifier to associate with the span.
        RETURNS (Span): The newly constructed object.

        DOCS: https://spacy.io/api/doc#char_span
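Note: with this change everything after `label` is keyword-only, and `span_id` is documented alongside the other attributes. A quick sketch of the updated call style (text and IDs are just example values):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("Welcome to the Bank of China.")

    # label can stay positional; kb_id / alignment_mode / span_id must be keywords.
    span = doc.char_span(15, 28, "ORG", kb_id="Q790068", alignment_mode="contract", span_id="s1")
    print(span.text, span.label_, span.kb_id_, span.id_)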
@ -656,9 +657,6 @@ cdef class Doc:
|
|||
elif self.vocab.vectors.size > 0:
|
||||
self._vector = sum(t.vector for t in self) / len(self)
|
||||
return self._vector
|
||||
elif self.tensor.size > 0:
|
||||
self._vector = self.tensor.mean(axis=0)
|
||||
return self._vector
|
||||
else:
|
||||
return xp.zeros((self.vocab.vectors_length,), dtype="float32")
|
||||
|
||||
|
@ -705,10 +703,10 @@ cdef class Doc:
|
|||
return self.text
|
||||
|
||||
property ents:
|
||||
"""The named entities in the document. Returns a tuple of named entity
|
||||
"""The named entities in the document. Returns a list of named entity
|
||||
`Span` objects, if the entity recognizer has been applied.
|
||||
|
||||
RETURNS (tuple): Entities in the document, one `Span` per entity.
|
||||
RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#ents
|
||||
"""
|
||||
|
@ -866,7 +864,7 @@ cdef class Doc:
|
|||
NP-level coordination, no prepositional phrases, and no relative
|
||||
clauses.
|
||||
|
||||
YIELDS (Span): Noun chunks in the document.
|
||||
RETURNS (Tuple[Span]): Noun chunks in the document.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#noun_chunks
|
||||
"""
|
||||
|
@ -875,36 +873,35 @@ cdef class Doc:
|
|||
|
||||
# Accumulate the result before beginning to iterate over it. This
|
||||
# prevents the tokenization from being changed out from under us
|
||||
# during the iteration. The tricky thing here is that Span accepts
|
||||
# its tokenization changing, so it's okay once we have the Span
|
||||
# objects. See Issue #375.
|
||||
# during the iteration.
|
||||
spans = []
|
||||
for start, end, label in self.noun_chunks_iterator(self):
|
||||
spans.append(Span(self, start, end, label=label))
|
||||
for span in spans:
|
||||
yield span
|
||||
return tuple(spans)
|
||||
|
||||
@property
|
||||
def sents(self):
|
||||
"""Iterate over the sentences in the document. Yields sentence `Span`
|
||||
objects. Sentence spans have no label.
|
||||
|
||||
YIELDS (Span): Sentences in the document.
|
||||
RETURNS (Tuple[Span]): Sentences in the document.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#sents
|
||||
"""
|
||||
if not self.has_annotation("SENT_START"):
|
||||
raise ValueError(Errors.E030)
|
||||
if "sents" in self.user_hooks:
|
||||
yield from self.user_hooks["sents"](self)
|
||||
return tuple(self.user_hooks["sents"](self))
|
||||
else:
|
||||
start = 0
|
||||
spans = []
|
||||
for i in range(1, self.length):
|
||||
if self.c[i].sent_start == 1:
|
||||
yield Span(self, start, i)
|
||||
spans.append(Span(self, start, i))
|
||||
start = i
|
||||
if start != self.length:
|
||||
yield Span(self, start, self.length)
|
||||
spans.append(Span(self, start, self.length))
|
||||
return tuple(spans)
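
Because `sents` (like `noun_chunks`) now returns a tuple rather than a generator, the result can be indexed and measured directly. A small sketch against this branch's behavior, using the `sentencizer` to set sentence boundaries:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("This is a sentence. This is another one.")
sents = doc.sents
assert isinstance(sents, tuple)  # previously a generator
assert sents[0].text == "This is a sentence."
```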
|
||||
|
||||
@property
|
||||
def lang(self):
|
||||
|
@ -1604,7 +1601,7 @@ cdef class Doc:
|
|||
for span_group in doc_json.get("spans", {}):
|
||||
spans = []
|
||||
for span in doc_json["spans"][span_group]:
|
||||
char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"])
|
||||
char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"])
|
||||
if char_span is None:
|
||||
raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"]))
|
||||
spans.append(char_span)
|
||||
@ -74,6 +74,8 @@ class Span:
|
|||
@property
|
||||
def ents(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def sents(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def has_vector(self) -> bool: ...
|
||||
@property
|
||||
def vector(self) -> Floats1d: ...
|
||||
|
@ -86,7 +88,7 @@ class Span:
|
|||
@property
|
||||
def text_with_ws(self) -> str: ...
|
||||
@property
|
||||
def noun_chunks(self) -> Iterator[Span]: ...
|
||||
def noun_chunks(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def root(self) -> Token: ...
|
||||
def char_span(
|
||||
|
@ -94,8 +96,11 @@ class Span:
|
|||
start_idx: int,
|
||||
end_idx: int,
|
||||
label: Union[int, str] = ...,
|
||||
*,
|
||||
kb_id: Union[int, str] = ...,
|
||||
vector: Optional[Floats1d] = ...,
|
||||
alignment_mode: str = ...,
|
||||
span_id: Union[int, str] = ...,
|
||||
) -> Span: ...
|
||||
@property
|
||||
def conjuncts(self) -> Tuple[Token]: ...
|
||||
|
|
|
@ -134,10 +134,8 @@ cdef class Span:
|
|||
else:
|
||||
return True
|
||||
|
||||
cdef SpanC* span_c = self.span_c()
|
||||
cdef SpanC* other_span_c = other.span_c()
|
||||
self_tuple = (span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, self.id, self.doc)
|
||||
other_tuple = (other_span_c.start_char, other_span_c.end_char, other_span_c.label, other_span_c.kb_id, other.id, other.doc)
|
||||
self_tuple = self._cmp_tuple()
|
||||
other_tuple = other._cmp_tuple()
|
||||
# <
|
||||
if op == 0:
|
||||
return self_tuple < other_tuple
|
||||
|
@ -158,8 +156,20 @@ cdef class Span:
|
|||
return self_tuple >= other_tuple
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self._cmp_tuple())
|
||||
|
||||
def _cmp_tuple(self):
|
||||
cdef SpanC* span_c = self.span_c()
|
||||
return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, span_c.id))
|
||||
return (
|
||||
span_c.start_char,
|
||||
span_c.end_char,
|
||||
span_c.start,
|
||||
span_c.end,
|
||||
span_c.label,
|
||||
span_c.kb_id,
|
||||
span_c.id,
|
||||
self.doc,
|
||||
)
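
With `__hash__` and the rich comparisons both derived from `_cmp_tuple`, equal spans hash equally, so they behave consistently in sets and as dict keys. A brief sketch with illustrative text:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Berlin is nice in May")
assert doc[0:2] == doc[0:2]
assert hash(doc[0:2]) == hash(doc[0:2])
assert len({doc[0:2], doc[0:2]}) == 1  # identical spans collapse in a set
```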
|
||||
|
||||
def __len__(self):
|
||||
"""Get the number of tokens in the span.
|
||||
|
@ -382,7 +392,7 @@ cdef class Span:
|
|||
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
|
||||
return result.item()
|
||||
|
||||
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
|
||||
|
@ -451,20 +461,21 @@ cdef class Span:
|
|||
"""Obtain the sentences that contain this span. If the given span
|
||||
crosses sentence boundaries, return all sentences it is a part of.
|
||||
|
||||
RETURNS (Iterable[Span]): All sentences that the span is a part of.
|
||||
RETURNS (Tuple[Span]): All sentences that the span is a part of.
|
||||
|
||||
DOCS: https://spacy.io/api/span#sents
|
||||
DOCS: https://spacy.io/api/span#sents
|
||||
"""
|
||||
cdef int start
|
||||
cdef int i
|
||||
|
||||
if "sents" in self.doc.user_span_hooks:
|
||||
yield from self.doc.user_span_hooks["sents"](self)
|
||||
elif "sents" in self.doc.user_hooks:
|
||||
return tuple(self.doc.user_span_hooks["sents"](self))
|
||||
spans = []
|
||||
if "sents" in self.doc.user_hooks:
|
||||
for sentence in self.doc.user_hooks["sents"](self.doc):
|
||||
if sentence.end > self.start:
|
||||
if sentence.start < self.end or sentence.start == self.start == self.end:
|
||||
yield sentence
|
||||
spans.append(sentence)
|
||||
else:
|
||||
break
|
||||
else:
|
||||
|
@ -479,12 +490,13 @@ cdef class Span:
|
|||
# Now, find all the sentences in the span
|
||||
for i in range(start + 1, self.doc.length):
|
||||
if self.doc.c[i].sent_start == 1:
|
||||
yield Span(self.doc, start, i)
|
||||
spans.append(Span(self.doc, start, i))
|
||||
start = i
|
||||
if start >= self.end:
|
||||
break
|
||||
if start < self.end:
|
||||
yield Span(self.doc, start, self.end)
|
||||
spans.append(Span(self.doc, start, self.end))
|
||||
return tuple(spans)
|
||||
|
||||
|
||||
@property
|
||||
|
@ -492,7 +504,7 @@ cdef class Span:
|
|||
"""The named entities that fall completely within the span. Returns
|
||||
a tuple of `Span` objects.
|
||||
|
||||
RETURNS (tuple): Entities in the span, one `Span` per entity.
|
||||
RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
|
||||
|
||||
DOCS: https://spacy.io/api/span#ents
|
||||
"""
|
||||
|
@ -507,7 +519,7 @@ cdef class Span:
|
|||
ents.append(ent)
|
||||
else:
|
||||
break
|
||||
return ents
|
||||
return tuple(ents)
|
||||
|
||||
@property
|
||||
def has_vector(self):
|
||||
|
@ -522,8 +534,6 @@ cdef class Span:
|
|||
return self.doc.user_span_hooks["has_vector"](self)
|
||||
elif self.vocab.vectors.size > 0:
|
||||
return any(token.has_vector for token in self)
|
||||
elif self.doc.tensor.size > 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
@ -605,13 +615,15 @@ cdef class Span:
|
|||
NP-level coordination, no prepositional phrases, and no relative
|
||||
clauses.
|
||||
|
||||
YIELDS (Span): Noun chunks in the span.
|
||||
RETURNS (Tuple[Span]): Noun chunks in the span.
|
||||
|
||||
DOCS: https://spacy.io/api/span#noun_chunks
|
||||
"""
|
||||
spans = []
|
||||
for span in self.doc.noun_chunks:
|
||||
if span.start >= self.start and span.end <= self.end:
|
||||
yield span
|
||||
spans.append(span)
|
||||
return tuple(spans)
|
||||
|
||||
@property
|
||||
def root(self):
|
||||
|
@ -656,22 +668,28 @@ cdef class Span:
|
|||
else:
|
||||
return self.doc[root]
|
||||
|
||||
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
|
||||
def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
|
||||
"""Create a `Span` object from the slice `span.text[start : end]`.
|
||||
|
||||
start (int): The index of the first character of the span.
|
||||
end (int): The index of the first character after the span.
|
||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
||||
start_idx (int): The index of the first character of the span.
|
||||
end_idx (int): The index of the first character after the span.
|
||||
label (Union[int, str]): A label to attach to the Span, e.g. for
|
||||
named entities.
|
||||
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
|
||||
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||
the span.
|
||||
alignment_mode (str): How character indices are aligned to token
|
||||
boundaries. Options: "strict" (character indices must be aligned
|
||||
with token boundaries), "contract" (span of all tokens completely
|
||||
within the character span), "expand" (span of all tokens at least
|
||||
partially covered by the character span). Defaults to "strict".
|
||||
span_id (Union[int, str]): An identifier to associate with the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
cdef SpanC* span_c = self.span_c()
|
||||
start_idx += span_c.start_char
|
||||
end_idx += span_c.start_char
|
||||
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
|
||||
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
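
A sketch of the extended `Span.char_span`, assuming the updated signature above: `alignment_mode` and `span_id` are forwarded to `Doc.char_span`, with the character offsets interpreted relative to the span.

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I like New York in Autumn.")
span = doc[2:6]  # "New York in Autumn"
sub = span.char_span(0, 8, "GPE", alignment_mode="contract", span_id="nyc")
assert sub.text == "New York"
```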
|
||||
|
||||
@property
|
||||
def conjuncts(self):
|
||||
|
|
|
@ -389,8 +389,6 @@ cdef class Token:
|
|||
"""
|
||||
if "has_vector" in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks["has_vector"](self)
|
||||
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
|
||||
return True
|
||||
return self.vocab.has_vector(self.c.lex.orth)
|
||||
|
||||
@property
|
||||
|
@ -404,8 +402,6 @@ cdef class Token:
|
|||
"""
|
||||
if "vector" in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks["vector"](self)
|
||||
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
|
||||
return self.doc.tensor[self.i]
|
||||
else:
|
||||
return self.vocab.get_vector(self.c.lex.orth)
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ def create_copy_from_base_model(
|
|||
) -> Callable[[Language], Language]:
|
||||
def copy_from_base_model(nlp):
|
||||
if tokenizer:
|
||||
logger.info(f"Copying tokenizer from: {tokenizer}")
|
||||
logger.info("Copying tokenizer from: %s", tokenizer)
|
||||
base_nlp = load_model(tokenizer)
|
||||
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
|
||||
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
|
||||
|
@ -23,7 +23,7 @@ def create_copy_from_base_model(
|
|||
)
|
||||
)
|
||||
if vocab:
|
||||
logger.info(f"Copying vocab from: {vocab}")
|
||||
logger.info("Copying vocab from: %s", vocab)
|
||||
# only reload if the vocab is from a different model
|
||||
if tokenizer != vocab:
|
||||
base_nlp = load_model(vocab)
|
||||
|
|
|
@ -29,7 +29,7 @@ def create_docbin_reader(
|
|||
) -> Callable[["Language"], Iterable[Example]]:
|
||||
if path is None:
|
||||
raise ValueError(Errors.E913)
|
||||
util.logger.debug(f"Loading corpus from path: {path}")
|
||||
util.logger.debug("Loading corpus from path: %s", path)
|
||||
return Corpus(
|
||||
path,
|
||||
gold_preproc=gold_preproc,
|
||||
|
|
|
@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
|||
frozen_components = T["frozen_components"]
|
||||
# Sourced components that require resume_training
|
||||
resume_components = [p for p in sourced if p not in frozen_components]
|
||||
logger.info(f"Pipeline: {nlp.pipe_names}")
|
||||
logger.info("Pipeline: %s", nlp.pipe_names)
|
||||
if resume_components:
|
||||
with nlp.select_pipes(enable=resume_components):
|
||||
logger.info(f"Resuming training for: {resume_components}")
|
||||
logger.info("Resuming training for: %s", resume_components)
|
||||
nlp.resume_training(sgd=optimizer)
|
||||
# Make sure that listeners are defined before initializing further
|
||||
nlp._link_components()
|
||||
|
@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
|||
if T["max_epochs"] == -1:
|
||||
sample_size = 100
|
||||
logger.debug(
|
||||
f"Due to streamed train corpus, using only first {sample_size} "
|
||||
f"examples for initialization. If necessary, provide all labels "
|
||||
f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
|
||||
"Due to streamed train corpus, using only first %s examples for initialization. "
|
||||
"If necessary, provide all labels in [initialize]. "
|
||||
"More info: https://spacy.io/api/cli#init_labels",
|
||||
sample_size,
|
||||
)
|
||||
nlp.initialize(
|
||||
lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
|
||||
)
|
||||
else:
|
||||
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
|
||||
logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
|
||||
logger.info("Initialized pipeline components: %s", nlp.pipe_names)
|
||||
# Detect components with listeners that are not frozen consistently
|
||||
for name, proc in nlp.pipeline:
|
||||
for listener in getattr(
|
||||
|
@ -109,7 +110,7 @@ def init_vocab(
|
|||
) -> None:
|
||||
if lookups:
|
||||
nlp.vocab.lookups = lookups
|
||||
logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
|
||||
logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
|
||||
data_path = ensure_path(data)
|
||||
if data_path is not None:
|
||||
lex_attrs = srsly.read_jsonl(data_path)
|
||||
|
@ -125,11 +126,11 @@ def init_vocab(
|
|||
else:
|
||||
oov_prob = DEFAULT_OOV_PROB
|
||||
nlp.vocab.cfg.update({"oov_prob": oov_prob})
|
||||
logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
|
||||
logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
|
||||
logger.info("Created vocabulary")
|
||||
if vectors is not None:
|
||||
load_vectors_into_model(nlp, vectors)
|
||||
logger.info(f"Added vectors: {vectors}")
|
||||
logger.info("Added vectors: %s", vectors)
|
||||
# warn if source model vectors are not identical
|
||||
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
|
||||
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
||||
|
@ -191,7 +192,7 @@ def init_tok2vec(
|
|||
if weights_data is not None:
|
||||
layer = get_tok2vec_ref(nlp, P)
|
||||
layer.from_bytes(weights_data)
|
||||
logger.info(f"Loaded pretrained weights from {init_tok2vec}")
|
||||
logger.info("Loaded pretrained weights from %s", init_tok2vec)
|
||||
return True
|
||||
return False
|
||||
|
||||
|
@ -202,7 +203,6 @@ def convert_vectors(
|
|||
*,
|
||||
truncate: int,
|
||||
prune: int,
|
||||
name: Optional[str] = None,
|
||||
mode: str = VectorsMode.default,
|
||||
) -> None:
|
||||
vectors_loc = ensure_path(vectors_loc)
|
||||
|
@ -216,13 +216,13 @@ def convert_vectors(
|
|||
nlp.vocab.deduplicate_vectors()
|
||||
else:
|
||||
if vectors_loc:
|
||||
logger.info(f"Reading vectors from {vectors_loc}")
|
||||
logger.info("Reading vectors from %s", vectors_loc)
|
||||
vectors_data, vector_keys, floret_settings = read_vectors(
|
||||
vectors_loc,
|
||||
truncate,
|
||||
mode=mode,
|
||||
)
|
||||
logger.info(f"Loaded vectors from {vectors_loc}")
|
||||
logger.info("Loaded vectors from %s", vectors_loc)
|
||||
else:
|
||||
vectors_data, vector_keys = (None, None)
|
||||
if vector_keys is not None and mode != VectorsMode.floret:
|
||||
|
@ -241,12 +241,6 @@ def convert_vectors(
|
|||
strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
|
||||
)
|
||||
nlp.vocab.deduplicate_vectors()
|
||||
if name is None:
|
||||
# TODO: Is this correct? Does this matter?
|
||||
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
|
||||
else:
|
||||
nlp.vocab.vectors.name = name
|
||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||
if prune >= 1 and mode != VectorsMode.floret:
|
||||
nlp.vocab.prune_vectors(prune)
|
||||
|
||||
|
|
|
@ -210,7 +210,7 @@ def train_while_improving(
|
|||
subbatch,
|
||||
drop=dropout,
|
||||
losses=losses,
|
||||
sgd=False, # type: ignore[arg-type]
|
||||
sgd=False,
|
||||
exclude=exclude,
|
||||
annotates=annotating_components,
|
||||
)
|
||||
|
@ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
|
|||
if subdir.exists():
|
||||
try:
|
||||
shutil.rmtree(str(subdir))
|
||||
logger.debug(f"Removed existing output directory: {subdir}")
|
||||
logger.debug("Removed existing output directory: %s", subdir)
|
||||
except Exception as e:
|
||||
raise IOError(Errors.E901.format(path=path)) from e
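
The switch from f-strings to %-style arguments across these call sites defers string interpolation to the logging framework, so the message is only formatted when the record is actually emitted. A minimal illustration with a hypothetical directory value:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("spacy")
subdir = "training/model-best"  # hypothetical value
logger.debug("Removed existing output directory: %s", subdir)  # only formatted if DEBUG is enabled
logger.info("Pipeline: %s", ["tok2vec", "ner"])
```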
|
||||
|
|
|
@ -33,6 +33,7 @@ import inspect
|
|||
import pkgutil
|
||||
import logging
|
||||
import socket
|
||||
import stat
|
||||
|
||||
try:
|
||||
import cupy.random
|
||||
|
@ -55,7 +56,7 @@ if TYPE_CHECKING:
|
|||
# fmt: off
|
||||
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
||||
DEFAULT_OOV_PROB = -20
|
||||
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
|
||||
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
|
||||
|
||||
# Default order of sections in the config file. Not all sections need to exist,
|
||||
# and additional sections are added at the end, in alphabetical order.
|
||||
|
@ -139,8 +140,17 @@ class registry(thinc.registry):
|
|||
return func
|
||||
|
||||
@classmethod
|
||||
def find(cls, registry_name: str, func_name: str) -> Callable:
|
||||
"""Get info about a registered function from the registry."""
|
||||
def find(
|
||||
cls, registry_name: str, func_name: str
|
||||
) -> Dict[str, Optional[Union[str, int]]]:
|
||||
"""Find information about a registered function, including the
|
||||
module and path to the file it's defined in, the line number and the
|
||||
docstring, if available.
|
||||
|
||||
registry_name (str): Name of the catalogue registry.
|
||||
func_name (str): Name of the registered function.
|
||||
RETURNS (Dict[str, Optional[Union[str, int]]]): The function info.
|
||||
"""
|
||||
# We're overwriting this classmethod so we're able to provide more
|
||||
# specific error messages and implement a fallback to spacy-legacy.
|
||||
if not hasattr(cls, registry_name):
|
||||
|
@ -1030,8 +1040,15 @@ def make_tempdir() -> Generator[Path, None, None]:
|
|||
"""
|
||||
d = Path(tempfile.mkdtemp())
|
||||
yield d
|
||||
|
||||
# On Windows, git clones use read-only files, which cause permission errors
|
||||
# when being deleted. This forcibly fixes permissions.
|
||||
def force_remove(rmfunc, path, ex):
|
||||
os.chmod(path, stat.S_IWRITE)
|
||||
rmfunc(path)
|
||||
|
||||
try:
|
||||
shutil.rmtree(str(d))
|
||||
shutil.rmtree(str(d), onerror=force_remove)
|
||||
except PermissionError as e:
|
||||
warnings.warn(Warnings.W091.format(dir=d, msg=e))
|
||||
|
||||
|
|
|
@ -52,7 +52,6 @@ cdef class Vectors:
|
|||
DOCS: https://spacy.io/api/vectors
|
||||
"""
|
||||
cdef public object strings
|
||||
cdef public object name
|
||||
cdef readonly object mode
|
||||
cdef public object data
|
||||
cdef public object key2row
|
||||
|
@ -64,14 +63,13 @@ cdef class Vectors:
|
|||
cdef readonly unicode bow
|
||||
cdef readonly unicode eow
|
||||
|
||||
def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
|
||||
def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
|
||||
"""Create a new vector store.
|
||||
|
||||
strings (StringStore): The string store.
|
||||
shape (tuple): Size of the table, as (# entries, # columns)
|
||||
data (numpy.ndarray or cupy.ndarray): The vector data.
|
||||
keys (iterable): A sequence of keys, aligned with the data.
|
||||
name (str): A name to identify the vectors table.
|
||||
mode (str): Vectors mode: "default" or "floret" (default: "default").
|
||||
minn (int): The floret char ngram minn (default: 0).
|
||||
maxn (int): The floret char ngram maxn (default: 0).
|
||||
|
@ -85,7 +83,6 @@ cdef class Vectors:
|
|||
self.strings = strings
|
||||
if self.strings is None:
|
||||
self.strings = StringStore()
|
||||
self.name = name
|
||||
if mode not in Mode.values():
|
||||
raise ValueError(
|
||||
Errors.E202.format(
|
||||
|
|
|
@ -11,7 +11,8 @@ from .vectors import Vectors
|
|||
from pathlib import Path
|
||||
|
||||
def create_vocab(
|
||||
lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
|
||||
lang: Optional[str],
|
||||
defaults: Any,
|
||||
) -> Vocab: ...
|
||||
|
||||
class Vocab:
|
||||
|
@ -25,10 +26,9 @@ class Vocab:
|
|||
def __init__(
|
||||
self,
|
||||
lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ...,
|
||||
strings: Optional[Union[List[str], StringStore]] = ...,
|
||||
strings: Optional[StringStore] = ...,
|
||||
lookups: Optional[Lookups] = ...,
|
||||
oov_prob: float = ...,
|
||||
vectors_name: Optional[str] = ...,
|
||||
writing_system: Dict[str, Any] = ...,
|
||||
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
|
||||
) -> None: ...
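
Per the updated stub, `strings` is typed as an optional `StringStore` rather than a list of strings. A minimal construction sketch against this branch:

```python
from spacy.strings import StringStore
from spacy.vocab import Vocab

vocab = Vocab(strings=StringStore(["apple", "orange"]))
assert "apple" in vocab.strings
```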
|
||||
|
|
|
@ -24,7 +24,7 @@ from .lang.norm_exceptions import BASE_NORMS
|
|||
from .lang.lex_attrs import LEX_ATTRS
|
||||
|
||||
|
||||
def create_vocab(lang, defaults, vectors_name=None):
|
||||
def create_vocab(lang, defaults):
|
||||
# If the spacy-lookups-data package is installed, we pre-populate the lookups
|
||||
# with lexeme data, if available
|
||||
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
|
||||
|
@ -38,7 +38,6 @@ def create_vocab(lang, defaults, vectors_name=None):
|
|||
lex_attr_data=defaults.lex_attr_data,
|
||||
writing_system=defaults.writing_system,
|
||||
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
|
||||
vectors_name=vectors_name,
|
||||
)
|
||||
|
||||
|
||||
|
@ -49,10 +48,9 @@ cdef class Vocab:
|
|||
|
||||
DOCS: https://spacy.io/api/vocab
|
||||
"""
|
||||
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
|
||||
oov_prob=-20., vectors_name=None, writing_system={},
|
||||
get_noun_chunks=None, lang="", lex_attr_data=None,
|
||||
**deprecated_kwargs):
|
||||
def __init__(self, lex_attr_getters=None, strings=None, lookups=None,
|
||||
oov_prob=-20., writing_system=None, get_noun_chunks=None,
|
||||
lang="", lex_attr_data=None):
|
||||
"""Create the vocabulary.
|
||||
|
||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
||||
|
@ -61,7 +59,6 @@ cdef class Vocab:
|
|||
vice versa.
|
||||
lookups (Lookups): Container for large lookup tables and dictionaries.
|
||||
oov_prob (float): Default OOV probability.
|
||||
vectors_name (str): Optional name to identify the vectors table.
|
||||
get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]):
|
||||
A function that yields base noun phrases used for Doc.noun_chunks.
|
||||
"""
|
||||
|
@ -73,17 +70,20 @@ cdef class Vocab:
|
|||
self.lang = lang
|
||||
self.mem = Pool()
|
||||
self._by_orth = PreshMap()
|
||||
self.strings = StringStore()
|
||||
self.length = 0
|
||||
if strings:
|
||||
for string in strings:
|
||||
_ = self[string]
|
||||
if strings is None:
|
||||
self.strings = StringStore()
|
||||
else:
|
||||
self.strings = strings
|
||||
self.lex_attr_getters = lex_attr_getters
|
||||
self.lex_attr_data = lex_attr_data
|
||||
self.morphology = Morphology(self.strings)
|
||||
self.vectors = Vectors(strings=self.strings, name=vectors_name)
|
||||
self.vectors = Vectors(strings=self.strings)
|
||||
self.lookups = lookups
|
||||
self.writing_system = writing_system
|
||||
if writing_system is None:
|
||||
self.writing_system = {}
|
||||
else:
|
||||
self.writing_system = writing_system
|
||||
self.get_noun_chunks = get_noun_chunks
|
||||
|
||||
property vectors:
|
||||
|
@ -304,7 +304,7 @@ cdef class Vocab:
|
|||
for key, row in self.vectors.key2row.items()
|
||||
}
|
||||
# replace vectors with deduplicated version
|
||||
self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name)
|
||||
self.vectors = Vectors(strings=self.strings, data=data)
|
||||
for key, row in key2row.items():
|
||||
self.vectors.add(key, row=row)
|
||||
|
||||
|
@ -354,7 +354,7 @@ cdef class Vocab:
|
|||
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
|
||||
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
|
||||
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
|
||||
self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name)
|
||||
self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row])
|
||||
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
|
||||
syn_keys = ops.to_numpy(syn_keys)
|
||||
remap = {}
|
||||
|
@ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
|
|||
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||
|
||||
### spacy.EmptyKB.v1 {id="EmptyKB"}
### spacy.EmptyKB.v1 {id="EmptyKB.v1"}

A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance. This is the default when a new entity linker component is created.
instance.

| Name                   | Description                                                                          |
| ---------------------- | ------------------------------------------------------------------------------------ |
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~  |

### spacy.EmptyKB.v2 {id="EmptyKB"}

A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance. This is the default when a new entity linker component is created. It
returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
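
A sketch of resolving the factory by its registered name and building an empty KB; the `registry.misc.get` lookup pattern is an assumption made for illustration:

```python
import spacy
from spacy import registry

nlp = spacy.blank("en")
make_empty_kb = registry.misc.get("spacy.EmptyKB.v2")()  # Callable[[Vocab, int], InMemoryLookupKB]
kb = make_empty_kb(nlp.vocab, 64)
assert kb.entity_vector_length == 64
```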
|
||||
|
||||
### spacy.KBFromFile.v1 {id="KBFromFile"}
|
||||
|
||||
A function that reads an existing `KnowledgeBase` from file.
|
||||
|
@ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
|
|||
`CandidateGenerator` uses the text of a mention to find its potential aliases in
|
||||
the `KnowledgeBase`. Note that this function is case-dependent.
|
||||
|
||||
### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}

A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
[`Span`](/api/span) objects denoting named entities, and returns a list of
plausible [`Candidate`](/api/kb/#candidate) objects per specified
[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
mention to find its potential aliases in the `KnowledgeBase`. Note that this
function is case-dependent.
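
A hypothetical drop-in with the same shape as the interface described above (not spaCy's own implementation): it simply delegates to the per-mention lookup, while a real version could batch the KB queries.

```python
from typing import Iterable

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span


def get_candidates_batch(
    kb: KnowledgeBase, mentions: Iterable[Span]
) -> Iterable[Iterable[Candidate]]:
    # One candidate list per mention, in the same order as the input spans.
    return [kb.get_candidates(span) for span in mentions]
```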
|
||||
|
||||
## Coreference {id="coref-architectures",tag="experimental"}
|
||||
|
||||
A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to
|
||||
|
|
|
@ -201,7 +201,7 @@ This functionality was previously available as part of the command `init-model`.
|
|||
</Infobox>
|
||||
|
||||
```bash
|
||||
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
|
||||
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -212,7 +212,6 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
|
|||
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
|
||||
| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
|
||||
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
|
||||
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
|
||||
|
@ -1410,12 +1409,13 @@ $ python -m spacy project assets [project_dir]
|
|||
> $ python -m spacy project assets [--sparse]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
||||
| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
|
||||
| Name | Description |
|
||||
| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
||||
| `--extra`, `-e` <Tag variant="new">3.3.1</Tag> | Download assets marked as "extra". Default false. ~~bool (flag)~~ |
|
||||
| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
|
||||
|
||||
### project run {id="project-run",tag="command"}
|
||||
|
||||
|
@ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir]
|
|||
### project pull {id="project-pull",tag="command"}
|
||||
|
||||
Download all files or directories listed as `outputs` for commands, unless they
|
||||
are not already present locally. When searching for files in the remote, `pull`
|
||||
are already present locally. When searching for files in the remote, `pull`
|
||||
won't just look at the output path, but will also consider the **command
|
||||
string** and the **hashes of the dependencies**. For instance, let's say you've
|
||||
previously pushed a checkpoint to the remote, but now you've changed some
|
||||
|
@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
|
|||
come directly from
|
||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||
|
||||
| Symbol | Description |
|
||||
| --------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| Symbol | Description |
|
||||
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
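
A sketch using one of the operators added in 3.5.1 (`>+`, right immediate child); the pattern keys follow the existing `DependencyMatcher` format and the names are illustrative:

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.blank("en")  # a pipeline with a parser would normally be used
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    # ">+" matches a child of "verb" that immediately follows it (verb.i == child.i - 1)
    {"LEFT_ID": "verb", "REL_OP": ">+", "RIGHT_ID": "next_child", "RIGHT_ATTRS": {}},
]
matcher.add("VERB_NEXT_CHILD", [pattern])
```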
|
||||
|
||||
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
|||
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
|
||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||
| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
|
@ -209,15 +209,17 @@ alignment mode `"strict".
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the last character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the last character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
|
||||
## Doc.set_ents {id="set_ents",tag="method",version="3"}
|
||||
|
||||
|
@ -652,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).
|
|||
|
||||
## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
|
||||
|
||||
Iterate over the base noun phrases in the document. Yields base noun-phrase
|
||||
`Span` objects, if the document has been syntactically parsed. A base noun
|
||||
phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
|
||||
nested within it – so no NP-level coordination, no prepositional phrases, and no
|
||||
relative clauses.
|
||||
Returns a tuple of the base noun phrases in the doc, if the document has been
|
||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
|
||||
does not permit other NPs to be nested within it – so no NP-level coordination,
|
||||
no prepositional phrases, and no relative clauses.
|
||||
|
||||
To customize the noun chunk iterator in a loaded pipeline, modify
|
||||
[`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
|
||||
|
@ -673,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
|
|||
> assert chunks[1].text == "another phrase"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------- |
|
||||
| **YIELDS** | Noun chunks in the document. ~~Span~~ |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------- |
|
||||
| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
|
||||
|
||||
## Doc.sents {id="sents",tag="property",model="sentences"}
|
||||
|
||||
Iterate over the sentences in the document. Sentence spans have no label.
|
||||
Returns a tuple of the sentences in the document. Sentence spans have no label.
|
||||
|
||||
This property is only available when
|
||||
[sentence boundaries](/usage/linguistic-features#sbd) have been set on the
|
||||
|
@ -695,9 +696,9 @@ will raise an error otherwise.
|
|||
> assert [s.root.text for s in sents] == ["is", "'s"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | ----------------------------------- |
|
||||
| **YIELDS** | Sentences in the document. ~~Span~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------ |
|
||||
| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
|
||||
|
||||
## Doc.has_vector {id="has_vector",tag="property",model="vectors"}
|
||||
|
||||
|
|
|
@ -53,20 +53,22 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("entity_linker", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| Setting | Description |
|
||||
| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
|
||||
| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
|
||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
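
A sketch of overriding the newer settings when adding the component; the registered function names are the ones documented in the table above, and the blank pipeline is only for illustration (a real setup also needs a knowledge base and training data):

```python
import spacy

nlp = spacy.blank("en")
config = {
    "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
    "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
    "n_sents": 1,
    "incl_prior": False,
}
entity_linker = nlp.add_pipe("entity_linker", config=config)
```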
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
||||
@ -10,9 +10,9 @@ version: 3.5
|
|||
|
||||
The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
|
||||
implements all of its methods. It stores all KB data in-memory and generates
|
||||
[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
|
||||
entity names. It's highly optimized for both a low memory footprint and speed of
|
||||
retrieval.
|
||||
[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions
|
||||
with entity names. It's highly optimized for both a low memory footprint and
|
||||
speed of retrieval.
|
||||
|
||||
## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"}
|
||||
|
||||
|
@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base.
|
|||
## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"}
|
||||
|
||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||
of type [`Candidate`](/api/kb#candidate). Wraps
|
||||
of type [`InMemoryCandidate`](/api/kb#candidate). Wraps
|
||||
[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
|
||||
|
||||
> #### Example
|
||||
|
@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps
|
|||
> candidates = kb.get_candidates(doc[0:2])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------- |
|
||||
| `mention` | The textual mention or alias. ~~Span~~ |
|
||||
| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------ |
|
||||
| `mention` | The textual mention or alias. ~~Span~~ |
|
||||
| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ |
|
||||
|
||||
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
|
||||
|
||||
|
@ -189,31 +189,16 @@ to you.
|
|||
>
|
||||
> ```python
|
||||
> from spacy.lang.en import English
|
||||
> from spacy.tokens import SpanGroup
|
||||
> nlp = English()
|
||||
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||
> candidates = kb.get_candidates((doc[0:2], doc[3:]))
|
||||
> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
|
||||
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
||||
|
||||
## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"}
|
||||
|
||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||
of type [`Candidate`](/api/kb#candidate).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> candidates = kb.get_alias_candidates("Douglas")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------- |
|
||||
| `alias` | The textual mention or alias. ~~str~~ |
|
||||
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------ |
|
||||
| `mentions` | The textual mentions. ~~SpanGroup~~ |
|
||||
| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
|
||||
|
||||
## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
|
||||
|
||||
|
|
|
@ -93,33 +93,17 @@ to you.
|
|||
>
|
||||
> ```python
|
||||
> from spacy.lang.en import English
|
||||
> from spacy.tokens import SpanGroup
|
||||
> nlp = English()
|
||||
> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
|
||||
> candidates = kb.get_candidates((doc[0:2], doc[3:]))
|
||||
> candidates = kb.get_candidates(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||
| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ |
|
||||
| `mentions` | The textual mentions. ~~SpanGroup~~ |
|
||||
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
|
||||
|
||||
## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
|
||||
|
||||
<Infobox variant="warning">
|
||||
This method is _not_ available from spaCy 3.5 onwards.
|
||||
</Infobox>
|
||||
|
||||
From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
|
||||
[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
|
||||
allow more flexibility in customizing knowledge bases. Some of its methods were
|
||||
moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
|
||||
one of those being `get_alias_candidates()`. This method is now available as
|
||||
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
|
||||
Note:
|
||||
[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
|
||||
defaults to
|
||||
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
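
A short migration sketch for the note above: code that used to call `get_alias_candidates()` on a bare `KnowledgeBase` now needs an `InMemoryLookupKB` (or another subclass providing it). The entity ID and alias below are illustrative:

```python
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=64)
kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.8])
candidates = kb.get_alias_candidates("Douglas")
```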
|
||||
|
||||
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
|
||||
|
||||
Given a certain entity ID, retrieve its pretrained entity vector.
|
||||
|
@ -190,25 +174,25 @@ Restore the state of the knowledge base from a given directory. Note that the
|
|||
| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
||||
|
||||
## Candidate {id="candidate",tag="class"}
|
||||
## InMemoryCandidate {id="candidate",tag="class"}
|
||||
|
||||
A `Candidate` object refers to a textual mention (alias) that may or may not be
|
||||
resolved to a specific entity from a `KnowledgeBase`. This will be used as input
|
||||
for the entity linking algorithm which will disambiguate the various candidates
|
||||
to the correct one. Each candidate `(alias, entity)` pair is assigned to a
|
||||
certain prior probability.
|
||||
An `InMemoryCandidate` object refers to a textual mention (alias) that may or
|
||||
may not be resolved to a specific entity from a `KnowledgeBase`. This will be
|
||||
used as input for the entity linking algorithm which will disambiguate the
|
||||
various candidates to the correct one. Each candidate `(alias, entity)` pair is
|
||||
assigned to a certain prior probability.
|
||||
|
||||
### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"}
|
||||
### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
|
||||
|
||||
Construct a `Candidate` object. Usually this constructor is not called directly,
|
||||
but instead these objects are returned by the `get_candidates` method of the
|
||||
[`entity_linker`](/api/entitylinker) pipe.
|
||||
Construct an `InMemoryCandidate` object. Usually this constructor is not called
|
||||
directly, but instead these objects are returned by the `get_candidates` method
|
||||
of the [`entity_linker`](/api/entitylinker) pipe.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.kb import Candidate
|
||||
> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
|
||||
> from spacy.kb import InMemoryCandidate
> candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -216,10 +200,10 @@ but instead these objects are returned by the `get_candidates` method of the
|
|||
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
|
||||
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
|
||||
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
|
||||
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
|
||||
| `alias_hash` | The hash of the entity alias. ~~int~~ |
|
||||
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
||||
|
||||
## Candidate attributes {id="candidate-attributes"}
|
||||
## InMemoryCandidate attributes {id="candidate-attributes"}
|
||||
|
||||
| Name | Description |
|
||||
| --------------- | ------------------------------------------------------------------------ |
|
||||
|
|
|
@@ -323,15 +323,15 @@ and custom registered functions if needed. See the

> nlp.update([example], sgd=optimizer)
> ```

| Name            | Description |
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_  | |
| `drop`          | The dropout rate. ~~float~~ |
| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses`        | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ |
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~ |
| Name            | Description |
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_  | |
| `drop`          | The dropout rate. Defaults to `0.0`. ~~float~~ |
| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
| `losses`        | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~ |
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~ |
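For reference, a minimal end-to-end sketch of calling `update` outside a full
training loop – the blank pipeline, NER component and toy example below are
illustrative only. As the updated table notes, v4 additionally accepts
`sgd=False` to skip the optimizer step entirely.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("ner")

doc = nlp.make_doc("Apple is looking at buying a U.K. startup.")
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})

# initialize() infers the NER labels from the examples and returns an optimizer.
optimizer = nlp.initialize(get_examples=lambda: [example])
losses = nlp.update([example], sgd=optimizer, losses={})
print(losses)
```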
## Language.distill {id="distill",tag="method,experimental",version="4"}
@@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.

| Setting | Description |
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ |
| `extend` <Tag variant="new">3.2</Tag> | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
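The `"pos"`/`"morph"` scorer defaults suggest this is the morphologizer's
settings table, so – assuming that component name – overriding a setting from
the table might look like the following sketch:

```python
import spacy

nlp = spacy.blank("en")
# Config keys mirror the settings above; "morphologizer" is assumed to be the component name.
nlp.add_pipe("morphologizer", config={"overwrite": False, "extend": False})
print(nlp.get_pipe_config("morphologizer"))
```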
@@ -186,14 +186,17 @@ the character indices don't map to a valid span.

> assert span.text == "New York"
> ```

| Name        | Description |
| ----------- | ----------------------------------------------------------------------------------------- |
| `start`     | The index of the first character of the span. ~~int~~ |
| `end`       | The index of the last character after the span. ~~int~~ |
| `label`     | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id`     | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector`    | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
| Name | Description |
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start_idx` | The index of the first character of the span. ~~int~~ |
| `end_idx` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| _keyword-only_ | |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
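A small sketch of the new keyword-only behaviour, using a blank English
pipeline; the character offsets are relative to the span's own text, and
`alignment_mode` requires spaCy 3.5.1 or later:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I like New York in Autumn")
span = doc[1:5]  # "like New York in"

# Offsets are relative to the span's text, so chars 5-13 cover "New York".
assert span.char_span(5, 13).text == "New York"
# With alignment_mode="expand", partially covered tokens snap to full tokens.
assert span.char_span(7, 13, alignment_mode="expand").text == "New York"
```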
## Span.similarity {id="similarity",tag="method",model="vectors"}

@@ -272,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of

> assert ents[0].text == "Mr. Best"
> ```

| Name        | Description                                                        |
| ----------- | ------------------------------------------------------------------ |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~  |
| Name        | Description                                                   |
| ----------- | ------------------------------------------------------------- |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~  |
## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}

Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
objects, if the document has been syntactically parsed. A base noun phrase, or
"NP chunk", is a noun phrase that does not permit other NPs to be nested within
it – so no NP-level coordination, no prepositional phrases, and no relative
clauses.
Returns a tuple of the base noun phrases in the span if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
does not permit other NPs to be nested within it – so no NP-level coordination,
no prepositional phrases, and no relative clauses.

If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
has not been implemented for the given language, a `NotImplementedError` is

@@ -298,9 +300,9 @@ raised.

> assert chunks[0].text == "another phrase"
> ```

| Name       | Description                       |
| ---------- | --------------------------------- |
| **YIELDS** | Noun chunks in the span. ~~Span~~ |
| Name        | Description                              |
| ----------- | ---------------------------------------- |
| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
## Span.as_doc {id="as_doc",tag="method"}

@@ -522,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]

## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}

Returns a generator over the sentences the span belongs to. This property is
only available when [sentence boundaries](/usage/linguistic-features#sbd) have
been set on the document by the `parser`, `senter`, `sentencizer` or some custom
Returns a tuple of the sentences the span belongs to. This property is only
available when [sentence boundaries](/usage/linguistic-features#sbd) have been
set on the document by the `parser`, `senter`, `sentencizer` or some custom
function. It will raise an error otherwise.

If the span happens to cross sentence boundaries, all sentences the span

@@ -538,9 +540,9 @@ overlaps with will be returned.

> assert len(span.sents) == 2
> ```

| Name        | Description                                                                 |
| ----------- | --------------------------------------------------------------------------- |
| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~  |
| Name        | Description                                                    |
| ----------- | -------------------------------------------------------------- |
| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~  |

## Attributes {id="attributes"}
@@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of

integer IDs. This ensures that strings always map to the same ID, even from
different `StringStores`.

<Infobox variant="warning">

Note that a `StringStore` instance is not static. It increases in size as texts
with new tokens are processed.

</Infobox>
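A quick sketch of that behaviour, using a blank English pipeline for
illustration:

```python
import spacy

nlp = spacy.blank("en")
n_before = len(nlp.vocab.strings)
nlp("Zygomorphic quokkas perambulate")  # unseen tokens are added to the store
n_after = len(nlp.vocab.strings)
assert n_after > n_before
print(n_before, n_after)
```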
## StringStore.\_\_init\_\_ {id="init",tag="method"}

Create the `StringStore`.
@@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both

| `doc`       | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~  |

## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"}

Performs an update of the student pipe's model using the student's distillation
examples and sets the annotations of the teacher's distillation examples using
the teacher pipe.

Unlike other trainable pipes, the student pipe doesn't directly learn its
representations from the teacher. However, since downstream pipes that do
perform distillation expect the tok2vec annotations to be present on the
correct distillation examples, we need to ensure that they are set beforehand.

The distillation is performed on ~~Example~~ objects. The `Example.reference`
and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
same orthography. Even though the reference does not need to have gold
annotations, the teacher can add its own annotations when necessary.

This feature is experimental.

> #### Example
>
> ```python
> teacher_pipe = teacher.add_pipe("tok2vec")
> student_pipe = student.add_pipe("tok2vec")
> optimizer = nlp.resume_training()
> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
> ```

| Name           | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~ |
| `examples`     | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop`         | Dropout rate. ~~float~~ |
| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |

## Tok2Vec.pipe {id="pipe",tag="method"}

Apply the pipe to a stream of documents. This usually happens under the hood
@@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used.

> displacy.serve(doc, style="dep", options=options)
> ```

| Name | Description |
| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
| Name | Description |
| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
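As a small illustration of how these settings are passed in practice (the color
and font values below are arbitrary, and the sketch assumes `en_core_web_sm` is
installed):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
options = {"compact": True, "color": "green", "bg": "#f0f0f0", "font": "Source Sans Pro"}
html = displacy.render(doc, style="dep", options=options)  # returns the rendered SVG markup
```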
#### Named Entity Visualizer options {id="displacy_options-ent"}
@@ -52,7 +52,6 @@ modified later.

| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
| `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ |
| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
| `name` | A name to identify the vectors table. ~~str~~ |
| `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ |
| `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ |
| `maxn` <Tag variant="new">3.2</Tag> | The floret char ngram maxn (default: `0`). ~~int~~ |
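A minimal construction sketch in the spirit of this table – the data and keys
are toy values, and the exact constructor signature may differ slightly between
versions:

```python
import numpy
from spacy.vectors import Vectors

# Three 300-dimensional zero vectors keyed by strings (toy data).
data = numpy.zeros((3, 300), dtype="f")
keys = ["cat", "dog", "rat"]
vectors = Vectors(data=data, keys=keys)
print(vectors.shape)  # (3, 300)
```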
@@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access

[`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
between `Doc` objects.

<Infobox variant="warning">

Note that a `Vocab` instance is not static. It increases in size as texts with
new tokens are processed.

</Infobox>

## Vocab.\_\_init\_\_ {id="init",tag="method"}

Create the vocabulary.

@@ -17,17 +24,17 @@ Create the vocabulary.

> #### Example
>
> ```python
> from spacy.strings import StringStore
> from spacy.vocab import Vocab
> vocab = Vocab(strings=["hello", "world"])
> vocab = Vocab(strings=StringStore(["hello", "world"]))
> ```

| Name | Description |
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ |
| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values. ~~Optional[StringStore]~~ |
| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
| `vectors_name` | A name to identify the vectors table. ~~str~~ |
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
@@ -21,8 +21,8 @@ menu:

## Package naming conventions {id="conventions"}

In general, spaCy expects all pipeline packages to follow the naming convention
of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name
into three components:
of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into
three components:

1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
   tagging, parsing, lemmatization and named entity recognition, or `dep` for
@@ -22,17 +22,20 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,

<Infobox title="Important note" variant="warning">

To make them compact and fast, spaCy's small [pipeline packages](/models) (all
packages that end in `sm`) **don't ship with word vectors**, and only include
context-sensitive **tensors**. This means you can still use the `similarity()`
methods to compare documents, spans and tokens – but the result won't be as
good, and individual tokens won't have any vectors assigned. So in order to use
_real_ word vectors, you need to download a larger pipeline package:
packages that end in `sm`) **don't ship with word vectors**. In order to use
`similarity()`, you need to download a larger pipeline package that includes
vectors:

```diff
- python -m spacy download en_core_web_sm
+ python -m spacy download en_core_web_lg
+ python -m spacy download en_core_web_md
```

In spaCy v3 and earlier, small pipeline packages supported `similarity()` by
backing off to context-sensitive tensors from the `tok2vec` component. These
tensors do not work well for this purpose and this backoff has been removed in
spaCy v4.

</Infobox>
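For example, with a vector-equipped package such as `en_core_web_md` installed
(as shown above):

```python
import spacy

nlp = spacy.load("en_core_web_md")
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")
print(doc1.similarity(doc2))
```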
Pipeline packages that come with built-in word vectors make them available as
@@ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which

come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):

| Symbol    | Description |
| --------- | -------------------------------------------------------------------------------------------------------------------- |
| `A < B`   | `A` is the immediate dependent of `B`. |
| `A > B`   | `A` is the immediate head of `B`. |
| `A << B`  | `A` is the dependent in a chain to `B` following dep → head paths. |
| `A >> B`  | `A` is the head in a chain to `B` following head → dep paths. |
| `A . B`   | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B`  | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B`   | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B`  | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B`  | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B`  | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| Symbol                                  | Description |
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
| `A < B`                                 | `A` is the immediate dependent of `B`. |
| `A > B`                                 | `A` is the immediate head of `B`. |
| `A << B`                                | `A` is the dependent in a chain to `B` following dep → head paths. |
| `A >> B`                                | `A` is the head in a chain to `B` following head → dep paths. |
| `A . B`                                 | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B`                                | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B`                                 | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B`                                | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B`                                | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B`                                | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B`                               | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B`                               | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A >++ B`                               | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A >-- B`                               | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A <++ B`                               | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B`                               | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
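As a rough sketch of one of the newer operators in a pattern – this assumes a
parsed `Doc` from `en_core_web_sm` and a made-up match rule:

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    # ">++": the object must be a child of the verb that sits to its right.
    {"LEFT_ID": "verb", "REL_OP": ">++", "RIGHT_ID": "object", "RIGHT_ATTRS": {"DEP": "dobj"}},
]
matcher.add("VERB_WITH_RIGHT_OBJECT", [pattern])

doc = nlp("The cat chased the mouse.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # e.g. ['chased', 'mouse']
```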
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
@@ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline")

The saved pipeline now includes the `"entity_ruler"` in its
[`config.cfg`](/api/data-formats#config) and the pipeline directory contains a
file `entityruler.jsonl` with the patterns. When you load the pipeline back in,
all pipeline components will be restored and deserialized – including the entity
file `patterns.jsonl` with the patterns. When you load the pipeline back in, all
pipeline components will be restored and deserialized – including the entity
ruler. This lets you ship powerful pipeline packages with binary weights _and_
rules included!
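A small end-to-end sketch of that round trip; the path and pattern are
placeholders:

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Explosion"}])
nlp.to_disk("/path/to/pipeline")          # writes config.cfg plus the patterns file

nlp2 = spacy.load("/path/to/pipeline")    # the entity ruler comes back fully restored
doc = nlp2("Explosion makes spaCy.")
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Explosion', 'ORG')]
```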
@@ -155,6 +155,21 @@ An error is now raised when unsupported values are given as input to train a

`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
as explained in the [docs](/api/textcategorizer#assigned-attributes).

### Using the default knowledge base

As `KnowledgeBase` is now an abstract class, you should call the constructor of
the new `InMemoryLookupKB` instead when you want to use spaCy's default KB
implementation:

```diff
- kb = KnowledgeBase()
+ kb = InMemoryLookupKB()
```

If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to
implement its abstract methods, or alternatively inherit from `InMemoryLookupKB`
instead.
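In practice the constructor also needs a shared vocab and the entity vector
length used by your pipeline, so a slightly fuller sketch of the replacement
might look like this (the vector length of 64 and the IDs are arbitrary):

```python
from spacy.kb import InMemoryLookupKB
from spacy.vocab import Vocab

kb = InMemoryLookupKB(vocab=Vocab(), entity_vector_length=64)
kb.add_entity(entity="Q1004791", freq=12, entity_vector=[0.0] * 64)
kb.add_alias(alias="Berlin", entities=["Q1004791"], probabilities=[1.0])
print(kb.get_size_entities(), kb.get_size_aliases())  # 1 1
```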
### Updated scorers for tokenization and textcat {id="scores"}

We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
@@ -58,12 +58,12 @@ arcs.

</Infobox>

| Argument  | Description |
| --------- | ----------------------------------------------------------------------------------------- |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color`   | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
| `bg`      | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
| `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| Argument  | Description |
| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color`   | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg`      | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |

For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options).