Merge branch 'refactor/el-candidates' into refactor/span-group-for-mentions

# Conflicts:
#	spacy/pipeline/entity_linker.py
This commit is contained in:
Raphael Mitsch 2023-03-07 09:10:10 +01:00
commit 86703da8b7
47 changed files with 648 additions and 308 deletions

View File

@ -69,6 +69,11 @@ steps:
# displayName: 'Test skip re-download (#12188)'
# condition: eq(variables['python_version'], '3.8')
# - script: |
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
# displayName: 'Test download_url in info CLI'
# condition: eq(variables['python_version'] '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI'

View File

@ -16,7 +16,7 @@ jobs:
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v4
- run: pip install black
- run: pip install black -c requirements.txt
- name: Auto-format code if needed
run: black spacy
# We can't run black --check here because that returns a non-zero excit

View File

@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
Python modules. If you've built spaCy from source, you'll already have both
tools installed.
As a general rule of thumb, we use f-strings for any formatting of strings.
One exception are calls to Python's `logging` functionality.
To avoid unnecessary string conversions in these cases, we use string formatting
templates with `%s` and `%d` etc.
**⚠️ Note that formatting and linting is currently only possible for Python
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**

View File

@ -41,7 +41,7 @@ jobs:
inputs:
versionSpec: "3.8"
- script: |
pip install black==22.3.0
pip install black -c requirements.txt
python -m black spacy --check
displayName: "black"
- script: |

View File

@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
mypy>=0.990,<0.1000; platform_machine != "aarch64"
mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black>=22.0,<23.0
black==22.3.0

View File

@ -90,9 +90,9 @@ def parse_config_overrides(
cli_overrides = _parse_overrides(args, is_cli=True)
if cli_overrides:
keys = [k for k in cli_overrides if k not in env_overrides]
logger.debug(f"Config overrides from CLI: {keys}")
logger.debug("Config overrides from CLI: %s", keys)
if env_overrides:
logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
logger.debug("Config overrides from env variables: %s", list(env_overrides))
return {**cli_overrides, **env_overrides}

View File

@ -1,10 +1,10 @@
from typing import Optional, Dict, Any, Union, List
import platform
import pkg_resources
import json
from pathlib import Path
from wasabi import Printer, MarkdownRenderer
import srsly
import importlib.metadata
from ._util import app, Arg, Opt, string_to_list
from .download import get_model_filename, get_latest_version
@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
dist-info available.
"""
try:
dist = pkg_resources.get_distribution(model)
data = json.loads(dist.get_metadata("direct_url.json"))
return data["url"]
except pkg_resources.DistributionNotFound:
# no such package
return None
dist = importlib.metadata.distribution(model)
text = dist.read_text("direct_url.json")
if isinstance(text, str):
data = json.loads(text)
return data["url"]
except Exception:
# something else, like no file or invalid JSON
return None
pass
return None
def info_model_url(model: str) -> Dict[str, Any]:

View File

@ -252,7 +252,7 @@ def get_third_party_dependencies(
raise regerr from None
module_name = func_info.get("module") # type: ignore[attr-defined]
if module_name: # the code is part of a module, not a --code file
modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
dependencies = []
for module_name in modules:
if module_name in distributions:

View File

@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# in the list.
while commands:
for i, cmd in enumerate(list(commands)):
logger.debug(f"CMD: {cmd['name']}.")
logger.debug("CMD: %s.", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if all(dep.exists() for dep in deps):
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
logger.debug(
f"URL: {url} for {output_path} with command hash {cmd_hash}"
"URL: %s for %s with command hash %s",
url,
output_path,
cmd_hash,
)
yield url, output_path
@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
commands.pop(i)
break
else:
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
else:
# If we didn't break the for loop, break the while loop.
break

View File

@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
logger.debug(f"CMD: cmd['name']")
logger.debug("CMD: %s", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps):
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
continue
cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
)
logger.debug(f"CMD_HASH: {cmd_hash}")
logger.debug("CMD_HASH: %s", cmd_hash)
for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path
if output_loc.exists() and _is_not_empty_dir(output_loc):
@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
content_hash=get_content_hash(output_loc),
)
logger.debug(
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
"URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
)
yield output_path, url

View File

@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path
from pathlib import Path
import pkg_resources
from wasabi import msg
from wasabi.util import locale_escape
import sys
@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
exist.
"""
import pkg_resources
failed_pkgs_msgs: List[str] = []
conflicting_pkgs_msgs: List[str] = []

View File

@ -437,8 +437,7 @@ class Errors(metaclass=ErrorsWithCodes):
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
"exceed 1, but found {sum}.")
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
"`kb.add_entity` and `kb.add_alias` to add entries.")
E139 = ("Knowledge base for component '{name}' is empty.")
E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the "
@ -951,7 +950,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
"with `displacy.serve(doc, port=port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_switch_port=True` to pick an available port automatically.")
"or use `auto_select_port=True` to pick an available port automatically.")
# v4 error strings
E4000 = ("Expected a Doc as input, but got: '{type}'")
@ -961,6 +960,7 @@ class Errors(metaclass=ErrorsWithCodes):
E4003 = ("Training examples for distillation must have the exact same tokens in the "
"reference and predicted docs.")
E4004 = ("Backprop is not supported when is_train is not set.")
E4005 = ("Expected `entity_id` to be of type {should_type}, but is of type {is_type}.")
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

View File

@ -1,12 +1,14 @@
import abc
from typing import List, Union, Callable
from ..errors import Errors
class Candidate(abc.ABC):
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
"""A `Candidate` object refers to a textual mention that may or may not be resolved
to a specific `entity_id` from a Knowledge Base. This will be used as input for the entity_id linking
algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity_id) pair is assigned a certain prior probability.
Each candidate (mention, entity_id) pair is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate-init
"""
@ -14,15 +16,13 @@ class Candidate(abc.ABC):
def __init__(
self,
mention: str,
entity_id: int,
entity_name: str,
entity_id: Union[str, int],
entity_vector: List[float],
prior_prob: float,
):
"""Initializes properties of `Candidate` instance.
mention (str): Mention text for this candidate.
entity_id (int): Unique entity ID.
entity_name (str): Entity name.
entity_id (Union[str, int]): Unique entity ID.
entity_vector (List[float]): Entity embedding.
prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
@ -31,19 +31,26 @@ class Candidate(abc.ABC):
"""
self._mention = mention
self._entity_id = entity_id
self._entity_name = entity_name
# Note that hashing an int value yields the same int value.
self._entity_id_hash = hash(entity_id)
self._entity_vector = entity_vector
self._prior_prob = prior_prob
@property
def entity(self) -> int:
"""RETURNS (int): Unique entity ID."""
def entity_id(self) -> Union[str, int]:
"""RETURNS (Union[str, int]): Unique entity ID."""
return self._entity_id
@property
def entity_(self) -> str:
"""RETURNS (int): Entity name."""
return self._entity_name
def entity_id_int(self) -> int:
"""RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
otherwise the hash of the entity ID string)."""
return self._entity_id_hash
@property
def entity_id_str(self) -> str:
"""RETURNS (str): String representation of entity ID."""
return str(self._entity_id)
@property
def mention(self) -> str:
@ -66,53 +73,44 @@ class InMemoryCandidate(Candidate):
def __init__(
self,
retrieve_string_from_hash: Callable[[int], str],
entity_hash: int,
entity_freq: int,
hash_to_str: Callable[[int], str],
entity_id: int,
mention: str,
entity_vector: List[float],
alias_hash: int,
prior_prob: float,
entity_freq: int,
):
"""
retrieve_string_from_hash (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab
hash.
entity_hash (str): Hashed entity name /ID.
hash_to_str (Callable[[int], str]): Callable retrieving entity name from provided entity/vocab hash.
entity_id (str): Entity ID as hash that can be looked up with InMemoryKB.vocab.strings.__getitem__().
entity_freq (int): Entity frequency in KB corpus.
entity_vector (List[float]): Entity embedding.
alias_hash (int): Hashed alias.
mention (str): Mention.
prior_prob (float): Prior probability of entity for this mention - i.e. the probability that, independent of
the context, this mention resolves to this entity_id in the corpus used to build the knowledge base. In
cases in which this isn't always possible (e.g.: the corpus to analyse contains mentions that the KB corpus
doesn't) it might be better to eschew this information and always supply the same value.
"""
super().__init__(
mention=retrieve_string_from_hash(alias_hash),
entity_id=entity_hash,
entity_name=retrieve_string_from_hash(entity_hash),
mention=mention,
entity_id=entity_id,
entity_vector=entity_vector,
prior_prob=prior_prob,
)
self._retrieve_string_from_hash = retrieve_string_from_hash
self._entity_hash = entity_hash
self._hash_to_str = hash_to_str
self._entity_freq = entity_freq
self._alias_hash = alias_hash
self._prior_prob = prior_prob
@property
def entity(self) -> int:
"""RETURNS (int): hash of the entity_id's KB ID/name"""
return self._entity_hash
@property
def alias(self) -> int:
"""RETURNS (int): hash of the alias"""
return self._alias_hash
@property
def alias_(self) -> str:
"""RETURNS (str): ID of the original alias"""
return self._retrieve_string_from_hash(self._alias_hash)
if not isinstance(self._entity_id, int):
raise ValueError(
Errors.E4005.format(should_type="int", is_type=str(type(entity_id)))
)
self._entity_id_str = self._hash_to_str(self._entity_id)
@property
def entity_freq(self) -> float:
"""RETURNS (float): Relative entity frequency."""
return self._entity_freq
@property
def entity_id_str(self) -> str:
"""RETURNS (str): String representation of entity ID."""
return self._entity_id_str

View File

@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
def is_empty(self):
return len(self) == 0
def __len__(self):
return self.get_size_entities()
@ -224,9 +227,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
self._aliases_table[alias_index] = alias_entry
def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]:
return self.get_alias_candidates(mention.text) # type: ignore
return self._get_alias_candidates(mention.text) # type: ignore
def get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
@ -240,12 +243,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
return [
InMemoryCandidate(
retrieve_string_from_hash=self.vocab.strings.__getitem__,
entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq,
hash_to_str=self.vocab.strings.__getitem__,
entity_id=self._entries[entry_index].entity_hash,
mention=alias,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
alias_hash=alias_hash,
prior_prob=prior_prob
prior_prob=prior_prob,
entity_freq=self._entries[entry_index].freq
)
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0

View File

@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults
from ...pipeline import Lemmatizer
# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
class SwedishDefaults(BaseDefaults):

View File

@ -0,0 +1,33 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
]
)
_suffixes = [
suffix
for suffix in TOKENIZER_SUFFIXES
if suffix not in ["'s", "'S", "s", "S", r"\'"]
]
_suffixes += [r"(?<=[^sSxXzZ])\'"]
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes

View File

@ -106,7 +106,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
lookups = load_lookups(lang=lang, tables=tables)
return lookups
@ -2072,7 +2072,7 @@ class Language:
pipe = self.get_pipe(pipe_name)
pipe_cfg = self._pipe_configs[pipe_name]
if listeners:
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
util.logger.debug("Replacing listeners of component '%s'", pipe_name)
if len(list(listeners)) != len(pipe_listeners):
# The number of listeners defined in the component model doesn't
# match the listeners to replace, so we won't be able to update

View File

@ -82,8 +82,12 @@ cdef class DependencyMatcher:
"$-": self._imm_left_sib,
"$++": self._right_sib,
"$--": self._left_sib,
">+": self._imm_right_child,
">-": self._imm_left_child,
">++": self._right_child,
">--": self._left_child,
"<+": self._imm_right_parent,
"<-": self._imm_left_parent,
"<++": self._right_parent,
"<--": self._left_parent,
}
@ -427,12 +431,34 @@ cdef class DependencyMatcher:
def _left_sib(self, doc, node):
return [doc[child.i] for child in doc[node].head.children if child.i < node]
def _imm_right_child(self, doc, node):
for child in doc[node].children:
if child.i == node + 1:
return [doc[child.i]]
return []
def _imm_left_child(self, doc, node):
for child in doc[node].children:
if child.i == node - 1:
return [doc[child.i]]
return []
def _right_child(self, doc, node):
return [doc[child.i] for child in doc[node].children if child.i > node]
def _left_child(self, doc, node):
return [doc[child.i] for child in doc[node].children if child.i < node]
def _imm_right_parent(self, doc, node):
if doc[node].head.i == node + 1:
return [doc[node].head]
return []
def _imm_left_parent(self, doc, node):
if doc[node].head.i == node - 1:
return [doc[node].head]
return []
def _right_parent(self, doc, node):
if doc[node].head.i > node:
return [doc[node].head]

View File

@ -829,6 +829,11 @@ def _get_attr_values(spec, string_store):
return attr_values
def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
# tuple order affects performance
return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
# These predicate helper classes are used to match the REGEX, IN, >= etc
# extensions to the matcher introduced in #3173.
@ -848,7 +853,7 @@ class _FuzzyPredicate:
fuzz = self.predicate[len("FUZZY"):] # number after prefix
self.fuzzy = int(fuzz) if fuzz else -1
self.fuzzy_compare = fuzzy_compare
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
def __call__(self, Token token):
if self.is_extension:
@ -870,7 +875,7 @@ class _RegexPredicate:
self.value = re.compile(value)
self.predicate = predicate
self.is_extension = is_extension
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
self.key = _predicate_cache_key(self.attr, self.predicate, value)
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -906,7 +911,7 @@ class _SetPredicate:
self.value = set(get_string_id(v) for v in value)
self.predicate = predicate
self.is_extension = is_extension
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -978,7 +983,7 @@ class _ComparisonPredicate:
self.value = value
self.predicate = predicate
self.is_extension = is_extension
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
self.key = _predicate_cache_key(self.attr, self.predicate, value)
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
if isinstance(value, dict):
for type_, cls in predicate_types.items():
if type_ in value:
key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
key = _predicate_cache_key(attr, type_, value[type_])
if key in seen_predicates:
output.append(seen_predicates[key])
else:

View File

@ -89,6 +89,14 @@ def load_kb(
return kb_from_file
@registry.misc("spacy.EmptyKB.v2")
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
return empty_kb_factory
@registry.misc("spacy.EmptyKB.v1")
def empty_kb(
entity_vector_length: int,

View File

@ -9,7 +9,6 @@ import random
from thinc.api import CosineDistance, Model, Optimizer, Config
from thinc.api import set_dropout_rate
from ..tokens import SpanGroup
from ..kb import KnowledgeBase, Candidate
from ..ml import empty_kb
from ..tokens import Doc, Span, SpanGroup
@ -58,6 +57,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
"overwrite": False,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True,
@ -85,6 +85,7 @@ def make_entity_linker(
get_candidates_batch: Callable[
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool,
scorer: Optional[Callable],
use_gold_ents: bool,
@ -107,6 +108,7 @@ def make_entity_linker(
get_candidates_batch (
Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
@ -148,6 +150,7 @@ def make_entity_linker(
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
get_candidates_batch=get_candidates_batch,
generate_empty_kb=generate_empty_kb,
overwrite=overwrite,
scorer=scorer,
use_gold_ents=use_gold_ents,
@ -189,6 +192,7 @@ class EntityLinker(TrainablePipe):
get_candidates_batch: Callable[
[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool = False,
scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool,
@ -212,7 +216,8 @@ class EntityLinker(TrainablePipe):
get_candidates_batch (
Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
overwrite (bool): Whether to overwrite existing non-empty annotations.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
@ -220,6 +225,7 @@ class EntityLinker(TrainablePipe):
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/entitylinker#init
"""
@ -236,6 +242,7 @@ class EntityLinker(TrainablePipe):
self.model = model
self.name = name
self.labels_discard = list(labels_discard)
# how many neighbour sentences to take into account
self.n_sents = n_sents
self.incl_prior = incl_prior
self.incl_context = incl_context
@ -243,9 +250,7 @@ class EntityLinker(TrainablePipe):
self.get_candidates_batch = get_candidates_batch
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account
# create an empty KB by default
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
self.scorer = scorer
self.use_gold_ents = use_gold_ents
self.candidates_batch_size = candidates_batch_size
@ -267,7 +272,7 @@ class EntityLinker(TrainablePipe):
# Raise an error if the knowledge base is not initialized.
if self.kb is None:
raise ValueError(Errors.E1018.format(name=self.name))
if len(self.kb) == 0:
if hasattr(self.kb, "is_empty") and self.kb.is_empty():
raise ValueError(Errors.E139.format(name=self.name))
def initialize(
@ -537,12 +542,12 @@ class EntityLinker(TrainablePipe):
)
elif len(candidates) == 1 and self.threshold is None:
# shortcut for efficiency reasons: take the 1 candidate
final_kb_ids.append(candidates[0].entity_)
final_kb_ids.append(candidates[0].entity_id_str)
self._add_activations(
doc_scores=doc_scores,
doc_ents=doc_ents,
scores=[1.0],
ents=[candidates[0].entity],
ents=[candidates[0].entity_id_int],
)
else:
random.shuffle(candidates)
@ -572,7 +577,7 @@ class EntityLinker(TrainablePipe):
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
final_kb_ids.append(
candidates[scores.argmax().item()].entity_
candidates[scores.argmax().item()].entity_id_str
if self.threshold is None
or scores.max() >= self.threshold
else EntityLinker.NIL
@ -581,7 +586,7 @@ class EntityLinker(TrainablePipe):
doc_scores=doc_scores,
doc_ents=doc_ents,
scores=scores,
ents=[c.entity for c in candidates],
ents=[c.entity_id_int for c in candidates],
)
self._add_doc_activations(
docs_scores=docs_scores,

View File

@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.issue(12311)
@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 1

View File

@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
("the", "brown", "$--", 0),
("brown", "the", "$--", 1),
("brown", "brown", "$--", 0),
("over", "jumped", "<+", 0),
("quick", "fox", "<+", 0),
("the", "quick", "<+", 0),
("brown", "fox", "<+", 1),
("quick", "fox", "<++", 1),
("quick", "over", "<++", 0),
("over", "jumped", "<++", 0),
("the", "fox", "<++", 2),
("brown", "fox", "<-", 0),
("fox", "over", "<-", 0),
("the", "over", "<-", 0),
("over", "jumped", "<-", 1),
("brown", "fox", "<--", 0),
("fox", "jumped", "<--", 0),
("fox", "over", "<--", 1),
("fox", "brown", ">+", 0),
("over", "fox", ">+", 0),
("over", "the", ">+", 0),
("jumped", "over", ">+", 1),
("jumped", "over", ">++", 1),
("fox", "lazy", ">++", 0),
("over", "the", ">++", 0),
("jumped", "over", ">-", 0),
("fox", "quick", ">-", 0),
("brown", "quick", ">-", 0),
("fox", "brown", ">-", 1),
("brown", "fox", ">--", 0),
("fox", "brown", ">--", 1),
("jumped", "fox", ">--", 1),

View File

@ -353,6 +353,9 @@ def test_kb_default(nlp):
"""Test that the default (empty) KB is loaded upon construction"""
entity_linker = nlp.add_pipe("entity_linker", config={})
assert len(entity_linker.kb) == 0
with pytest.raises(ValueError, match="E139"):
# this raises an error because the KB is empty
entity_linker.validate_kb()
assert entity_linker.kb.get_size_entities() == 0
assert entity_linker.kb.get_size_aliases() == 0
# 64 is the default value from pipeline.entity_linker
@ -468,8 +471,8 @@ def test_candidate_generation(nlp):
assert len(get_candidates(mykb, shrubbery_ent)) == 0
# test the content of the candidates
assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
assert get_candidates(mykb, adam_ent)[0].entity_id_str == "Q2"
assert get_candidates(mykb, adam_ent)[0].mention == "adam"
assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)
@ -499,7 +502,7 @@ def test_el_pipe_configuration(nlp):
assert doc[2].ent_kb_id_ == "Q2"
def get_lowercased_candidates(kb, span):
return kb.get_alias_candidates(span.text.lower())
return kb._get_alias_candidates(span.text.lower())
def get_lowercased_candidates_batch(kb, spans):
return [get_lowercased_candidates(kb, span) for span in spans]
@ -558,24 +561,22 @@ def test_vocab_serialization(nlp):
mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
candidates = mykb.get_alias_candidates("adam")
candidates = mykb._get_alias_candidates("adam")
assert len(candidates) == 1
assert candidates[0].entity == q2_hash
assert candidates[0].entity_ == "Q2"
assert candidates[0].alias == adam_hash
assert candidates[0].alias_ == "adam"
assert candidates[0].entity_id_int == q2_hash
assert candidates[0].entity_id_str == "Q2"
assert candidates[0].mention == "adam"
with make_tempdir() as d:
mykb.to_disk(d / "kb")
kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
kb_new_vocab.from_disk(d / "kb")
candidates = kb_new_vocab.get_alias_candidates("adam")
candidates = kb_new_vocab._get_alias_candidates("adam")
assert len(candidates) == 1
assert candidates[0].entity == q2_hash
assert candidates[0].entity_ == "Q2"
assert candidates[0].alias == adam_hash
assert candidates[0].alias_ == "adam"
assert candidates[0].entity_id_int == q2_hash
assert candidates[0].entity_id_str == "Q2"
assert candidates[0].mention == "adam"
assert kb_new_vocab.get_vector("Q2") == [2]
assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4)
@ -595,20 +596,20 @@ def test_append_alias(nlp):
mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
# test the size of the relevant candidates
assert len(mykb.get_alias_candidates("douglas")) == 2
assert len(mykb._get_alias_candidates("douglas")) == 2
# append an alias
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
# test the size of the relevant candidates has been incremented
assert len(mykb.get_alias_candidates("douglas")) == 3
assert len(mykb._get_alias_candidates("douglas")) == 3
# append the same alias-entity pair again should not work (will throw a warning)
with pytest.warns(UserWarning):
mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
# test the size of the relevant candidates remained unchanged
assert len(mykb.get_alias_candidates("douglas")) == 3
assert len(mykb._get_alias_candidates("douglas")) == 3
@pytest.mark.filterwarnings("ignore:\\[W036")
@ -905,11 +906,11 @@ def test_kb_to_bytes():
assert kb_2.contains_alias("Russ Cochran")
assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
kb_2.get_alias_candidates("Russ Cochran")
assert len(kb_1._get_alias_candidates("Russ Cochran")) == len(
kb_2._get_alias_candidates("Russ Cochran")
)
assert len(kb_1.get_alias_candidates("Randomness")) == len(
kb_2.get_alias_candidates("Randomness")
assert len(kb_1._get_alias_candidates("Randomness")) == len(
kb_2._get_alias_candidates("Randomness")
)

View File

@ -1,7 +1,10 @@
from typing import Callable
from pathlib import Path
from typing import Callable, Iterable, Any, Dict
from spacy import util
from spacy.util import ensure_path, registry, load_model_from_config
import srsly
from spacy import util, Errors
from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
from spacy.kb.kb_in_memory import InMemoryLookupKB
from spacy.vocab import Vocab
from thinc.api import Config
@ -63,19 +66,21 @@ def _check_kb(kb):
assert alias_string not in kb.get_alias_strings()
# check candidates & probabilities
candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_)
candidates = sorted(
kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_str
)
assert len(candidates) == 2
assert candidates[0].entity_ == "Q007"
assert candidates[0].entity_id_str == "Q007"
assert 6.999 < candidates[0].entity_freq < 7.01
assert candidates[0].entity_vector == [0, 0, 7]
assert candidates[0].alias_ == "double07"
assert candidates[0].mention == "double07"
assert 0.899 < candidates[0].prior_prob < 0.901
assert candidates[1].entity_ == "Q17"
assert candidates[1].entity_id_str == "Q17"
assert 1.99 < candidates[1].entity_freq < 2.01
assert candidates[1].entity_vector == [7, 1, 0]
assert candidates[1].alias_ == "double07"
assert candidates[1].mention == "double07"
assert 0.099 < candidates[1].prior_prob < 0.101
@ -91,7 +96,10 @@ def test_serialize_subclassed_kb():
[components.entity_linker]
factory = "entity_linker"
[components.entity_linker.generate_empty_kb]
@misc = "kb_test.CustomEmptyKB.v1"
[initialize]
[initialize.components]
@ -99,7 +107,7 @@ def test_serialize_subclassed_kb():
[initialize.components.entity_linker]
[initialize.components.entity_linker.kb_loader]
@misc = "spacy.CustomKB.v1"
@misc = "kb_test.CustomKB.v1"
entity_vector_length = 342
custom_field = 666
"""
@ -109,10 +117,57 @@ def test_serialize_subclassed_kb():
super().__init__(vocab, entity_vector_length)
self.custom_field = custom_field
@registry.misc("spacy.CustomKB.v1")
def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
"""We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
path = ensure_path(path)
if not path.exists():
path.mkdir(parents=True)
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
def serialize_custom_fields(file_path: Path) -> None:
srsly.write_json(file_path, {"custom_field": self.custom_field})
serialize = {
"contents": lambda p: self.write_contents(p),
"strings.json": lambda p: self.vocab.strings.to_disk(p),
"custom_fields": lambda p: serialize_custom_fields(p),
}
util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
"""We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
path = ensure_path(path)
if not path.exists():
raise ValueError(Errors.E929.format(loc=path))
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
def deserialize_custom_fields(file_path: Path) -> None:
self.custom_field = srsly.read_json(file_path)["custom_field"]
deserialize: Dict[str, Callable[[Any], Any]] = {
"contents": lambda p: self.read_contents(p),
"strings.json": lambda p: self.vocab.strings.from_disk(p),
"custom_fields": lambda p: deserialize_custom_fields(p),
}
util.from_disk(path, deserialize, exclude)
@registry.misc("kb_test.CustomEmptyKB.v1")
def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
return SubInMemoryLookupKB(
vocab=vocab,
entity_vector_length=entity_vector_length,
custom_field=0,
)
return empty_kb_factory
@registry.misc("kb_test.CustomKB.v1")
def custom_kb(
entity_vector_length: int, custom_field: int
) -> Callable[[Vocab], InMemoryLookupKB]:
) -> Callable[[Vocab], SubInMemoryLookupKB]:
def custom_kb_factory(vocab):
kb = SubInMemoryLookupKB(
vocab=vocab,
@ -139,6 +194,6 @@ def test_serialize_subclassed_kb():
nlp2 = util.load_model_from_path(tmp_dir)
entity_linker2 = nlp2.get_pipe("entity_linker")
# After IO, the KB is the standard one
assert type(entity_linker2.kb) == InMemoryLookupKB
assert type(entity_linker2.kb) == SubInMemoryLookupKB
assert entity_linker2.kb.entity_vector_length == 342
assert not hasattr(entity_linker2.kb, "custom_field")
assert entity_linker2.kb.custom_field == 666

View File

@ -2,7 +2,6 @@ import os
import math
from collections import Counter
from typing import Tuple, List, Dict, Any
import pkg_resources
import time
from pathlib import Path
@ -1126,6 +1125,7 @@ def test_cli_find_threshold(capsys):
)
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@pytest.mark.parametrize(
"reqs,output",
[
@ -1158,6 +1158,8 @@ def test_cli_find_threshold(capsys):
],
)
def test_project_check_requirements(reqs, output):
import pkg_resources
# excessive guard against unlikely package name
try:
pkg_resources.require("spacyunknowndoesnotexist12345")

View File

@ -1,5 +1,7 @@
import os
from pathlib import Path
import pytest
import srsly
from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc
@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
# Instead of checking specific wording of the output, which may change,
# we'll check that this section of the debug output is present.
assert "= Trainable Lemmatizer =" in result_debug_data.stdout
# project tests
SAMPLE_PROJECT = {
"title": "Sample project",
"description": "This is a project for testing",
"assets": [
{
"dest": "assets/spacy-readme.md",
"url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
"checksum": "411b2c89ccf34288fae8ed126bf652f7",
},
{
"dest": "assets/citation.cff",
"url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
"checksum": "c996bfd80202d480eb2e592369714e5e",
"extra": True,
},
],
"commands": [
{
"name": "ok",
"help": "print ok",
"script": ["python -c \"print('okokok')\""],
},
{
"name": "create",
"help": "make a file",
"script": ["touch abc.txt"],
"outputs": ["abc.txt"],
},
{
"name": "clean",
"help": "remove test file",
"script": ["rm abc.txt"],
},
],
}
SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
@pytest.fixture
def project_dir():
with make_tempdir() as pdir:
(pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
yield pdir
def test_project_document(project_dir):
readme_path = project_dir / "README.md"
assert not readme_path.exists(), "README already exists"
result = CliRunner().invoke(
app, ["project", "document", str(project_dir), "-o", str(readme_path)]
)
assert result.exit_code == 0
assert readme_path.is_file()
text = readme_path.read_text("utf-8")
assert SAMPLE_PROJECT["description"] in text
def test_project_assets(project_dir):
asset_dir = project_dir / "assets"
assert not asset_dir.exists(), "Assets dir is already present"
result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
assert result.exit_code == 0
assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
# check that extras work
result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
assert result.exit_code == 0
assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
def test_project_run(project_dir):
# make sure dry run works
test_file = project_dir / "abc.txt"
result = CliRunner().invoke(
app, ["project", "run", "--dry", "create", str(project_dir)]
)
assert result.exit_code == 0
assert not test_file.is_file()
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
assert result.exit_code == 0
assert test_file.is_file()
result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
assert result.exit_code == 0
assert "okokok" in result.stdout
@pytest.mark.parametrize(
"options",
[
"",
# "--sparse",
"--branch v3",
"--repo https://github.com/explosion/projects --branch v3",
],
)
def test_project_clone(options):
with make_tempdir() as workspace:
out = workspace / "project"
target = "benchmarks/ner_conll03"
if not options:
options = []
else:
options = options.split()
result = CliRunner().invoke(
app, ["project", "clone", target, *options, str(out)]
)
assert result.exit_code == 0
assert (out / "README.md").is_file()
def test_project_push_pull(project_dir):
proj = dict(SAMPLE_PROJECT)
remote = "xyz"
with make_tempdir() as remote_dir:
proj["remotes"] = {remote: str(remote_dir)}
proj_text = srsly.yaml_dumps(proj)
(project_dir / "project.yml").write_text(proj_text)
test_file = project_dir / "abc.txt"
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
assert result.exit_code == 0
assert test_file.is_file()
result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
assert result.exit_code == 0
result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
assert result.exit_code == 0
assert not test_file.exists()
result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
assert result.exit_code == 0
assert test_file.is_file()

View File

@ -98,7 +98,7 @@ def assert_sents_error(doc):
def warn_error(proc_name, proc, docs, e):
logger = logging.getLogger("spacy")
logger.warning(f"Trouble with component {proc_name}.")
logger.warning("Trouble with component %s.", proc_name)
@pytest.fixture

View File

@ -131,9 +131,9 @@ class Doc:
default: str = ...,
) -> None: ...
@property
def noun_chunks(self) -> Iterator[Span]: ...
def noun_chunks(self) -> Tuple[Span]: ...
@property
def sents(self) -> Iterator[Span]: ...
def sents(self) -> Tuple[Span]: ...
@property
def lang(self) -> int: ...
@property

View File

@ -703,10 +703,10 @@ cdef class Doc:
return self.text
property ents:
"""The named entities in the document. Returns a tuple of named entity
"""The named entities in the document. Returns a list of named entity
`Span` objects, if the entity recognizer has been applied.
RETURNS (tuple): Entities in the document, one `Span` per entity.
RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
DOCS: https://spacy.io/api/doc#ents
"""
@ -864,7 +864,7 @@ cdef class Doc:
NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Noun chunks in the document.
RETURNS (Tuple[Span]): Noun chunks in the document.
DOCS: https://spacy.io/api/doc#noun_chunks
"""
@ -873,36 +873,35 @@ cdef class Doc:
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenization from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
# its tokenization changing, so it's okay once we have the Span
# objects. See Issue #375.
# during the iteration.
spans = []
for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label))
for span in spans:
yield span
return tuple(spans)
@property
def sents(self):
"""Iterate over the sentences in the document. Yields sentence `Span`
objects. Sentence spans have no label.
YIELDS (Span): Sentences in the document.
RETURNS (Tuple[Span]): Sentences in the document.
DOCS: https://spacy.io/api/doc#sents
"""
if not self.has_annotation("SENT_START"):
raise ValueError(Errors.E030)
if "sents" in self.user_hooks:
yield from self.user_hooks["sents"](self)
return tuple(self.user_hooks["sents"](self))
else:
start = 0
spans = []
for i in range(1, self.length):
if self.c[i].sent_start == 1:
yield Span(self, start, i)
spans.append(Span(self, start, i))
start = i
if start != self.length:
yield Span(self, start, self.length)
spans.append(Span(self, start, self.length))
return tuple(spans)
@property
def lang(self):

View File

@ -74,6 +74,8 @@ class Span:
@property
def ents(self) -> Tuple[Span]: ...
@property
def sents(self) -> Tuple[Span]: ...
@property
def has_vector(self) -> bool: ...
@property
def vector(self) -> Floats1d: ...
@ -86,7 +88,7 @@ class Span:
@property
def text_with_ws(self) -> str: ...
@property
def noun_chunks(self) -> Iterator[Span]: ...
def noun_chunks(self) -> Tuple[Span]: ...
@property
def root(self) -> Token: ...
def char_span(

View File

@ -461,20 +461,21 @@ cdef class Span:
"""Obtain the sentences that contain this span. If the given span
crosses sentence boundaries, return all sentences it is a part of.
RETURNS (Iterable[Span]): All sentences that the span is a part of.
RETURNS (Tuple[Span]): All sentences that the span is a part of.
DOCS: https://spacy.io/api/span#sents
DOCS: https://spacy.io/api/span#sents
"""
cdef int start
cdef int i
if "sents" in self.doc.user_span_hooks:
yield from self.doc.user_span_hooks["sents"](self)
elif "sents" in self.doc.user_hooks:
return tuple(self.doc.user_span_hooks["sents"](self))
spans = []
if "sents" in self.doc.user_hooks:
for sentence in self.doc.user_hooks["sents"](self.doc):
if sentence.end > self.start:
if sentence.start < self.end or sentence.start == self.start == self.end:
yield sentence
spans.append(sentence)
else:
break
else:
@ -489,12 +490,13 @@ cdef class Span:
# Now, find all the sentences in the span
for i in range(start + 1, self.doc.length):
if self.doc.c[i].sent_start == 1:
yield Span(self.doc, start, i)
spans.append(Span(self.doc, start, i))
start = i
if start >= self.end:
break
if start < self.end:
yield Span(self.doc, start, self.end)
spans.append(Span(self.doc, start, self.end))
return tuple(spans)
@property
@ -502,7 +504,7 @@ cdef class Span:
"""The named entities that fall completely within the span. Returns
a tuple of `Span` objects.
RETURNS (tuple): Entities in the span, one `Span` per entity.
RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
DOCS: https://spacy.io/api/span#ents
"""
@ -517,7 +519,7 @@ cdef class Span:
ents.append(ent)
else:
break
return ents
return tuple(ents)
@property
def has_vector(self):
@ -613,13 +615,15 @@ cdef class Span:
NP-level coordination, no prepositional phrases, and no relative
clauses.
YIELDS (Span): Noun chunks in the span.
RETURNS (Tuple[Span]): Noun chunks in the span.
DOCS: https://spacy.io/api/span#noun_chunks
"""
spans = []
for span in self.doc.noun_chunks:
if span.start >= self.start and span.end <= self.end:
yield span
spans.append(span)
return tuple(spans)
@property
def root(self):

View File

@ -11,7 +11,7 @@ def create_copy_from_base_model(
) -> Callable[[Language], Language]:
def copy_from_base_model(nlp):
if tokenizer:
logger.info(f"Copying tokenizer from: {tokenizer}")
logger.info("Copying tokenizer from: %s", tokenizer)
base_nlp = load_model(tokenizer)
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
@ -23,7 +23,7 @@ def create_copy_from_base_model(
)
)
if vocab:
logger.info(f"Copying vocab from: {vocab}")
logger.info("Copying vocab from: %s", vocab)
# only reload if the vocab is from a different model
if tokenizer != vocab:
base_nlp = load_model(vocab)

View File

@ -29,7 +29,7 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]:
if path is None:
raise ValueError(Errors.E913)
util.logger.debug(f"Loading corpus from path: {path}")
util.logger.debug("Loading corpus from path: %s", path)
return Corpus(
path,
gold_preproc=gold_preproc,

View File

@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
frozen_components = T["frozen_components"]
# Sourced components that require resume_training
resume_components = [p for p in sourced if p not in frozen_components]
logger.info(f"Pipeline: {nlp.pipe_names}")
logger.info("Pipeline: %s", nlp.pipe_names)
if resume_components:
with nlp.select_pipes(enable=resume_components):
logger.info(f"Resuming training for: {resume_components}")
logger.info("Resuming training for: %s", resume_components)
nlp.resume_training(sgd=optimizer)
# Make sure that listeners are defined before initializing further
nlp._link_components()
@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
if T["max_epochs"] == -1:
sample_size = 100
logger.debug(
f"Due to streamed train corpus, using only first {sample_size} "
f"examples for initialization. If necessary, provide all labels "
f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
"Due to streamed train corpus, using only first %s examples for initialization. "
"If necessary, provide all labels in [initialize]. "
"More info: https://spacy.io/api/cli#init_labels",
sample_size,
)
nlp.initialize(
lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
)
else:
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
logger.info("Initialized pipeline components: %s", nlp.pipe_names)
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
for listener in getattr(
@ -109,7 +110,7 @@ def init_vocab(
) -> None:
if lookups:
nlp.vocab.lookups = lookups
logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
data_path = ensure_path(data)
if data_path is not None:
lex_attrs = srsly.read_jsonl(data_path)
@ -125,11 +126,11 @@ def init_vocab(
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
logger.info("Created vocabulary")
if vectors is not None:
load_vectors_into_model(nlp, vectors)
logger.info(f"Added vectors: {vectors}")
logger.info("Added vectors: %s", vectors)
# warn if source model vectors are not identical
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
@ -191,7 +192,7 @@ def init_tok2vec(
if weights_data is not None:
layer = get_tok2vec_ref(nlp, P)
layer.from_bytes(weights_data)
logger.info(f"Loaded pretrained weights from {init_tok2vec}")
logger.info("Loaded pretrained weights from %s", init_tok2vec)
return True
return False
@ -215,13 +216,13 @@ def convert_vectors(
nlp.vocab.deduplicate_vectors()
else:
if vectors_loc:
logger.info(f"Reading vectors from {vectors_loc}")
logger.info("Reading vectors from %s", vectors_loc)
vectors_data, vector_keys, floret_settings = read_vectors(
vectors_loc,
truncate,
mode=mode,
)
logger.info(f"Loaded vectors from {vectors_loc}")
logger.info("Loaded vectors from %s", vectors_loc)
else:
vectors_data, vector_keys = (None, None)
if vector_keys is not None and mode != VectorsMode.floret:

View File

@ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
if subdir.exists():
try:
shutil.rmtree(str(subdir))
logger.debug(f"Removed existing output directory: {subdir}")
logger.debug("Removed existing output directory: %s", subdir)
except Exception as e:
raise IOError(Errors.E901.format(path=path)) from e

View File

@ -33,6 +33,7 @@ import inspect
import pkgutil
import logging
import socket
import stat
try:
import cupy.random
@ -55,7 +56,7 @@ if TYPE_CHECKING:
# fmt: off
OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config file. Not all sections needs to exist,
# and additional sections are added at the end, in alphabetical order.
@ -139,8 +140,17 @@ class registry(thinc.registry):
return func
@classmethod
def find(cls, registry_name: str, func_name: str) -> Callable:
"""Get info about a registered function from the registry."""
def find(
cls, registry_name: str, func_name: str
) -> Dict[str, Optional[Union[str, int]]]:
"""Find information about a registered function, including the
module and path to the file it's defined in, the line number and the
docstring, if available.
registry_name (str): Name of the catalogue registry.
func_name (str): Name of the registered function.
RETURNS (Dict[str, Optional[Union[str, int]]]): The function info.
"""
# We're overwriting this classmethod so we're able to provide more
# specific error messages and implement a fallback to spacy-legacy.
if not hasattr(cls, registry_name):
@ -1028,11 +1038,19 @@ def make_tempdir() -> Generator[Path, None, None]:
YIELDS (Path): The path of the temp directory.
"""
d = Path(tempfile.mkdtemp())
yield d
# On Windows, git clones use read-only files, which cause permission errors
# when being deleted. This forcibly fixes permissions.
def force_remove(rmfunc, path, ex):
os.chmod(path, stat.S_IWRITE)
rmfunc(path)
try:
with tempfile.TemporaryDirectory() as td:
yield Path(td)
shutil.rmtree(str(d), onerror=force_remove)
except PermissionError as e:
warnings.warn(Warnings.W091.format(dir=td, msg=e))
warnings.warn(Warnings.W091.format(dir=d, msg=e))
def is_cwd(path: Union[Path, str]) -> bool:

View File

@ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.EmptyKB.v1 {id="EmptyKB"}
### spacy.EmptyKB.v1 {id="EmptyKB.v1"}
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance. This is the default when a new entity linker component is created.
instance.
| Name | Description |
| ---------------------- | ----------------------------------------------------------------------------------- |
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
### spacy.EmptyKB.v2 {id="EmptyKB"}
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance. This is the default when a new entity linker component is created. It
returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
### spacy.KBFromFile.v1 {id="KBFromFile"}
A function that reads an existing `KnowledgeBase` from file.
@ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
`CandidateGenerator` uses the text of a mention to find its potential aliases in
the `KnowledgeBase`. Note that this function is case-dependent.
### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}
A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
[`Span`](/api/span) objects denoting named entities, and returns a list of
plausible [`Candidate`](/api/kb/#candidate) objects per specified
[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
mention to find its potential aliases in the `KnowledgeBase`. Note that this
function is case-dependent.
## Coreference {id="coref-architectures",tag="experimental"}
A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to

View File

@ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir]
### project pull {id="project-pull",tag="command"}
Download all files or directories listed as `outputs` for commands, unless they
are not already present locally. When searching for files in the remote, `pull`
are already present locally. When searching for files in the remote, `pull`
won't just look at the output path, but will also consider the **command
string** and the **hashes of the dependencies**. For instance, let's say you've
previously pushed a checkpoint to the remote, but now you've changed some

View File

@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
| Symbol | Description |
| --------- | -------------------------------------------------------------------------------------------------------------------- |
| `A < B` | `A` is the immediate dependent of `B`. |
| `A > B` | `A` is the immediate head of `B`. |
| `A << B` | `A` is the dependent in a chain to `B` following dep &rarr; head paths. |
| `A >> B` | `A` is the head in a chain to `B` following head &rarr; dep paths. |
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
| Symbol | Description |
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
| `A < B` | `A` is the immediate dependent of `B`. |
| `A > B` | `A` is the immediate head of `B`. |
| `A << B` | `A` is the dependent in a chain to `B` following dep &rarr; head paths. |
| `A >> B` | `A` is the head in a chain to `B` following head &rarr; dep paths. |
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}

View File

@ -654,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).
## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
Iterate over the base noun phrases in the document. Yields base noun-phrase
`Span` objects, if the document has been syntactically parsed. A base noun
phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
nested within it so no NP-level coordination, no prepositional phrases, and no
relative clauses.
Returns a tuple of the base noun phrases in the doc, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
does not permit other NPs to be nested within it so no NP-level coordination,
no prepositional phrases, and no relative clauses.
To customize the noun chunk iterator in a loaded pipeline, modify
[`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@ -675,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
> assert chunks[1].text == "another phrase"
> ```
| Name | Description |
| ---------- | ------------------------------------- |
| **YIELDS** | Noun chunks in the document. ~~Span~~ |
| Name | Description |
| ----------- | -------------------------------------------- |
| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
## Doc.sents {id="sents",tag="property",model="sentences"}
Iterate over the sentences in the document. Sentence spans have no label.
Returns a tuple of the sentences in the document. Sentence spans have no label.
This property is only available when
[sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@ -697,9 +696,9 @@ will raise an error otherwise.
> assert [s.root.text for s in sents] == ["is", "'s"]
> ```
| Name | Description |
| ---------- | ----------------------------------- |
| **YIELDS** | Sentences in the document. ~~Span~~ |
| Name | Description |
| ----------- | ------------------------------------------ |
| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
## Doc.has_vector {id="has_vector",tag="property",model="vectors"}

View File

@ -53,20 +53,22 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("entity_linker", config=config)
> ```
| Setting | Description |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| Setting | Description |
| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py

View File

@ -104,23 +104,6 @@ to you.
| `mentions` | The textual mention or alias. ~~SpanGroup~~ |
| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"}
<Infobox variant="warning">
This method is _not_ available from spaCy 3.5 onwards.
</Infobox>
From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
allow more flexibility in customizing knowledge bases. Some of its methods were
moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
one of those being `get_alias_candidates()`. This method is now available as
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
Note:
[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
defaults to
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
Given a certain entity ID, retrieve its pretrained entity vector.
@ -201,26 +184,26 @@ to a certain prior probability.
### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}
Construct a `InMemoryCandidate` object. Usually this constructor is not called
Construct an `InMemoryCandidate` object. Usually this constructor is not called
directly, but instead these objects are returned by the `get_candidates` method
of the [`entity_linker`](/api/entitylinker) pipe.
> #### Example```python
>
> from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
> entity_hash, entity_freq, entity_vector, mention_hash, prior_prob)
>
> ```
>
> ```
| Name | Description |
| ------------- | ------------------------------------------------------------------------- |
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------- |
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
| `mention_hash` | The hash of the textual mention. ~~int~~ |
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
## InMemoryCandidate attributes {id="candidate-attributes"}

View File

@ -275,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of
> assert ents[0].text == "Mr. Best"
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------------- |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
| Name | Description |
| ----------- | ------------------------------------------------------------ |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |
## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}
Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
objects, if the document has been syntactically parsed. A base noun phrase, or
"NP chunk", is a noun phrase that does not permit other NPs to be nested within
it so no NP-level coordination, no prepositional phrases, and no relative
clauses.
Returns a tuple of the base noun phrases in the span if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
does not permit other NPs to be nested within it so no NP-level coordination,
no prepositional phrases, and no relative clauses.
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
has not been implemeted for the given language, a `NotImplementedError` is
@ -301,9 +300,9 @@ raised.
> assert chunks[0].text == "another phrase"
> ```
| Name | Description |
| ---------- | --------------------------------- |
| **YIELDS** | Noun chunks in the span. ~~Span~~ |
| Name | Description |
| ----------- | ---------------------------------------- |
| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
## Span.as_doc {id="as_doc",tag="method"}
@ -525,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]
## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}
Returns a generator over the sentences the span belongs to. This property is
only available when [sentence boundaries](/usage/linguistic-features#sbd) have
been set on the document by the `parser`, `senter`, `sentencizer` or some custom
Returns a tuple of the sentences the span belongs to. This property is only
available when [sentence boundaries](/usage/linguistic-features#sbd) have been
set on the document by the `parser`, `senter`, `sentencizer` or some custom
function. It will raise an error otherwise.
If the span happens to cross sentence boundaries, all sentences the span
@ -541,9 +540,9 @@ overlaps with will be returned.
> assert len(span.sents) == 2
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------------- |
| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ |
| Name | Description |
| ----------- | ------------------------------------------------------------- |
| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |
## Attributes {id="attributes"}

View File

@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="dep", options=options)
> ```
| Name | Description |
| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
| Name | Description |
| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
#### Named Entity Visualizer options {id="displacy_options-ent"}

View File

@ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
| Symbol | Description |
| --------- | -------------------------------------------------------------------------------------------------------------------- |
| `A < B` | `A` is the immediate dependent of `B`. |
| `A > B` | `A` is the immediate head of `B`. |
| `A << B` | `A` is the dependent in a chain to `B` following dep &rarr; head paths. |
| `A >> B` | `A` is the head in a chain to `B` following head &rarr; dep paths. |
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| Symbol | Description |
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
| `A < B` | `A` is the immediate dependent of `B`. |
| `A > B` | `A` is the immediate head of `B`. |
| `A << B` | `A` is the dependent in a chain to `B` following dep &rarr; head paths. |
| `A >> B` | `A` is the head in a chain to `B` following head &rarr; dep paths. |
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
@ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline")
The saved pipeline now includes the `"entity_ruler"` in its
[`config.cfg`](/api/data-formats#config) and the pipeline directory contains a
file `entityruler.jsonl` with the patterns. When you load the pipeline back in,
all pipeline components will be restored and deserialized including the entity
file `patterns.jsonl` with the patterns. When you load the pipeline back in, all
pipeline components will be restored and deserialized including the entity
ruler. This lets you ship powerful pipeline packages with binary weights _and_
rules included!

View File

@ -58,12 +58,12 @@ arcs.
</Infobox>
| Argument | Description |
| --------- | ----------------------------------------------------------------------------------------- |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| Argument | Description |
| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options).