Merge branch 'v4' into feature/multiple-code-files

This commit is contained in:
Adriane Boyd 2023-03-17 08:44:10 +01:00 committed by GitHub
commit bae85e4c82
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
63 changed files with 980 additions and 358 deletions

View File

@ -69,6 +69,11 @@ steps:
# displayName: 'Test skip re-download (#12188)' # displayName: 'Test skip re-download (#12188)'
# condition: eq(variables['python_version'], '3.8') # condition: eq(variables['python_version'], '3.8')
# - script: |
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
# displayName: 'Test download_url in info CLI'
# condition: eq(variables['python_version'], '3.8')
- script: | - script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI' displayName: 'Test convert CLI'

View File

@ -16,7 +16,7 @@ jobs:
with: with:
ref: ${{ github.head_ref }} ref: ${{ github.head_ref }}
- uses: actions/setup-python@v4 - uses: actions/setup-python@v4
- run: pip install black - run: pip install black -c requirements.txt
- name: Auto-format code if needed - name: Auto-format code if needed
run: black spacy run: black spacy
# We can't run black --check here because that returns a non-zero exit # We can't run black --check here because that returns a non-zero exit

View File

@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
Python modules. If you've built spaCy from source, you'll already have both Python modules. If you've built spaCy from source, you'll already have both
tools installed. tools installed.
As a general rule of thumb, we use f-strings for any formatting of strings.
The one exception is calls to Python's `logging` functionality.
To avoid unnecessary string conversions in these cases, we use string formatting
templates with `%s`, `%d`, etc.
**⚠️ Note that formatting and linting is currently only possible for Python **⚠️ Note that formatting and linting is currently only possible for Python
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**

View File

@ -41,7 +41,7 @@ jobs:
inputs: inputs:
versionSpec: "3.8" versionSpec: "3.8"
- script: | - script: |
pip install black==22.3.0 pip install black -c requirements.txt
python -m black spacy --check python -m black spacy --check
displayName: "black" displayName: "black"
- script: | - script: |

View File

@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0 mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0 flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0 hypothesis>=3.27.0,<7.0.0
mypy>=0.990,<0.1000; platform_machine != "aarch64" mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1 types-mock>=0.1.1
types-setuptools>=57.0.0 types-setuptools>=57.0.0
types-requests types-requests
types-setuptools>=57.0.0 types-setuptools>=57.0.0
black>=22.0,<23.0 black==22.3.0

View File

@ -90,9 +90,9 @@ def parse_config_overrides(
cli_overrides = _parse_overrides(args, is_cli=True) cli_overrides = _parse_overrides(args, is_cli=True)
if cli_overrides: if cli_overrides:
keys = [k for k in cli_overrides if k not in env_overrides] keys = [k for k in cli_overrides if k not in env_overrides]
logger.debug(f"Config overrides from CLI: {keys}") logger.debug("Config overrides from CLI: %s", keys)
if env_overrides: if env_overrides:
logger.debug(f"Config overrides from env variables: {list(env_overrides)}") logger.debug("Config overrides from env variables: %s", list(env_overrides))
return {**cli_overrides, **env_overrides} return {**cli_overrides, **env_overrides}

View File

@ -1,10 +1,10 @@
from typing import Optional, Dict, Any, Union, List from typing import Optional, Dict, Any, Union, List
import platform import platform
import pkg_resources
import json import json
from pathlib import Path from pathlib import Path
from wasabi import Printer, MarkdownRenderer from wasabi import Printer, MarkdownRenderer
import srsly import srsly
import importlib.metadata
from ._util import app, Arg, Opt, string_to_list from ._util import app, Arg, Opt, string_to_list
from .download import get_model_filename, get_latest_version from .download import get_model_filename, get_latest_version
@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
dist-info available. dist-info available.
""" """
try: try:
dist = pkg_resources.get_distribution(model) dist = importlib.metadata.distribution(model)
data = json.loads(dist.get_metadata("direct_url.json")) text = dist.read_text("direct_url.json")
return data["url"] if isinstance(text, str):
except pkg_resources.DistributionNotFound: data = json.loads(text)
# no such package return data["url"]
return None
except Exception: except Exception:
# something else, like no file or invalid JSON pass
return None return None
def info_model_url(model: str) -> Dict[str, Any]: def info_model_url(model: str) -> Dict[str, Any]:

View File

@ -21,7 +21,6 @@ def init_vectors_cli(
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
# fmt: on # fmt: on
@ -44,7 +43,6 @@ def init_vectors_cli(
vectors_loc, vectors_loc,
truncate=truncate, truncate=truncate,
prune=prune, prune=prune,
name=name,
mode=mode, mode=mode,
) )
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")

View File

@ -252,7 +252,7 @@ def get_third_party_dependencies(
raise regerr from None raise regerr from None
module_name = func_info.get("module") # type: ignore[attr-defined] module_name = func_info.get("module") # type: ignore[attr-defined]
if module_name: # the code is part of a module, not a --code file if module_name: # the code is part of a module, not a --code file
modules.add(func_info["module"].split(".")[0]) # type: ignore[index] modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
dependencies = [] dependencies = []
for module_name in modules: for module_name in modules:
if module_name in distributions: if module_name in distributions:

View File

@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# in the list. # in the list.
while commands: while commands:
for i, cmd in enumerate(list(commands)): for i, cmd in enumerate(list(commands)):
logger.debug(f"CMD: {cmd['name']}.") logger.debug("CMD: %s.", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])] deps = [project_dir / dep for dep in cmd.get("deps", [])]
if all(dep.exists() for dep in deps): if all(dep.exists() for dep in deps):
cmd_hash = get_command_hash("", "", deps, cmd["script"]) cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []): for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash) url = storage.pull(output_path, command_hash=cmd_hash)
logger.debug( logger.debug(
f"URL: {url} for {output_path} with command hash {cmd_hash}" "URL: %s for %s with command hash %s",
url,
output_path,
cmd_hash,
) )
yield url, output_path yield url, output_path
@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
commands.pop(i) commands.pop(i)
break break
else: else:
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.") logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
else: else:
# If we didn't break the for loop, break the while loop. # If we didn't break the for loop, break the while loop.
break break

View File

@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
remote = config["remotes"][remote] remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote) storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []): for cmd in config.get("commands", []):
logger.debug(f"CMD: cmd['name']") logger.debug("CMD: %s", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])] deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps): if any(not dep.exists() for dep in deps):
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs") logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
continue continue
cmd_hash = get_command_hash( cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
) )
logger.debug(f"CMD_HASH: {cmd_hash}") logger.debug("CMD_HASH: %s", cmd_hash)
for output_path in cmd.get("outputs", []): for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path output_loc = project_dir / output_path
if output_loc.exists() and _is_not_empty_dir(output_loc): if output_loc.exists() and _is_not_empty_dir(output_loc):
@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
content_hash=get_content_hash(output_loc), content_hash=get_content_hash(output_loc),
) )
logger.debug( logger.debug(
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}" "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
) )
yield output_path, url yield output_path, url

View File

@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path import os.path
from pathlib import Path from pathlib import Path
import pkg_resources
from wasabi import msg from wasabi import msg
from wasabi.util import locale_escape from wasabi.util import locale_escape
import sys import sys
@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
exist. exist.
""" """
import pkg_resources
failed_pkgs_msgs: List[str] = [] failed_pkgs_msgs: List[str] = []
conflicting_pkgs_msgs: List[str] = [] conflicting_pkgs_msgs: List[str] = []

View File

@ -437,8 +437,7 @@ class Errors(metaclass=ErrorsWithCodes):
E133 = ("The sum of prior probabilities for alias '{alias}' should not " E133 = ("The sum of prior probabilities for alias '{alias}' should not "
"exceed 1, but found {sum}.") "exceed 1, but found {sum}.")
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
E139 = ("Knowledge base for component '{name}' is empty. Use the methods " E139 = ("Knowledge base for component '{name}' is empty.")
"`kb.add_entity` and `kb.add_alias` to add entries.")
E140 = ("The list of entities, prior probabilities and entity vectors " E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.") "should be of equal length.")
E141 = ("Entity vectors should be of length {required} instead of the " E141 = ("Entity vectors should be of length {required} instead of the "
@ -951,7 +950,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port " E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
"with `displacy.serve(doc, port=port)`") "with `displacy.serve(doc, port=port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_switch_port=True` to pick an available port automatically.") "or use `auto_select_port=True` to pick an available port automatically.")
# v4 error strings # v4 error strings
E4000 = ("Expected a Doc as input, but got: '{type}'") E4000 = ("Expected a Doc as input, but got: '{type}'")
@ -961,6 +960,7 @@ class Errors(metaclass=ErrorsWithCodes):
E4003 = ("Training examples for distillation must have the exact same tokens in the " E4003 = ("Training examples for distillation must have the exact same tokens in the "
"reference and predicted docs.") "reference and predicted docs.")
E4004 = ("Backprop is not supported when is_train is not set.") E4004 = ("Backprop is not supported when is_train is not set.")
E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}

View File

@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
self._alias_index = PreshMap(nr_aliases + 1) self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1)
def is_empty(self):
    """Check whether the knowledge base holds nothing.

    RETURNS (bool): True if `len(self)` is 0, False otherwise.
    """
    return len(self) == 0
def __len__(self): def __len__(self):
return self.get_size_entities() return self.get_size_entities()

View File

@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults from ...language import Language, BaseDefaults
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
class SwedishDefaults(BaseDefaults): class SwedishDefaults(BaseDefaults):

View File

@ -0,0 +1,33 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES
# Quote characters used in the infix rules, with the plain apostrophe removed.
_quotes = CONCAT_QUOTES.replace("'", "")

# Infix rules: ellipses and icons first, then punctuation that sits between
# letters (or, for the last two rules, after a letter or digit).
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # "." between a lowercase and an uppercase letter
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        # ",", "!" or "?" between two letters
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        # "<", ">" or "=" between two letters
        r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
        # ":" only between two uppercase letters
        r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
        # "," between two letters
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # a quote or bracket between two letters
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
        # "--" between two letters
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
        # "<", ">", "=" or "/" after a letter or digit and before a letter
        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
        # ":" after an uppercase letter or digit and before an uppercase letter
        r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
    ]
)

# Suffix rules: the shared suffixes without the "'s"/"s" and bare-apostrophe
# entries, plus an apostrophe rule that does not fire after s, x or z.
_suffixes = [
    suffix
    for suffix in TOKENIZER_SUFFIXES
    if suffix not in ("'s", "'S", "s", "S", r"\'")
] + [r"(?<=[^sSxXzZ])\'"]

TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes

View File

@ -106,7 +106,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
@registry.misc("spacy.LookupsDataLoader.v1") @registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables): def load_lookups_data(lang, tables):
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
lookups = load_lookups(lang=lang, tables=tables) lookups = load_lookups(lang=lang, tables=tables)
return lookups return lookups
@ -174,8 +174,7 @@ class Language:
if not isinstance(vocab, Vocab) and vocab is not True: if not isinstance(vocab, Vocab) and vocab is not True:
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
if vocab is True: if vocab is True:
vectors_name = meta.get("vectors", {}).get("name") vocab = create_vocab(self.lang, self.Defaults)
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
else: else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang): if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@ -229,7 +228,6 @@ class Language:
"width": self.vocab.vectors_length, "width": self.vocab.vectors_length,
"vectors": len(self.vocab.vectors), "vectors": len(self.vocab.vectors),
"keys": self.vocab.vectors.n_keys, "keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name,
"mode": self.vocab.vectors.mode, "mode": self.vocab.vectors.mode,
} }
self._meta["labels"] = dict(self.pipe_labels) self._meta["labels"] = dict(self.pipe_labels)
@ -2074,7 +2072,7 @@ class Language:
pipe = self.get_pipe(pipe_name) pipe = self.get_pipe(pipe_name)
pipe_cfg = self._pipe_configs[pipe_name] pipe_cfg = self._pipe_configs[pipe_name]
if listeners: if listeners:
util.logger.debug(f"Replacing listeners of component '{pipe_name}'") util.logger.debug("Replacing listeners of component '%s'", pipe_name)
if len(list(listeners)) != len(pipe_listeners): if len(list(listeners)) != len(pipe_listeners):
# The number of listeners defined in the component model doesn't # The number of listeners defined in the component model doesn't
# match the listeners to replace, so we won't be able to update # match the listeners to replace, so we won't be able to update
@ -2197,9 +2195,6 @@ class Language:
if path.exists(): if path.exists():
data = srsly.read_json(path) data = srsly.read_json(path)
self.meta.update(data) self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
def deserialize_vocab(path: Path) -> None: def deserialize_vocab(path: Path) -> None:
if path.exists(): if path.exists():
@ -2268,9 +2263,6 @@ class Language:
def deserialize_meta(b): def deserialize_meta(b):
data = srsly.json_loads(b) data = srsly.json_loads(b)
self.meta.update(data) self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
deserializers: Dict[str, Callable[[bytes], Any]] = {} deserializers: Dict[str, Callable[[bytes], Any]] = {}
deserializers["config.cfg"] = lambda b: self.config.from_bytes( deserializers["config.cfg"] = lambda b: self.config.from_bytes(

View File

@ -82,8 +82,12 @@ cdef class DependencyMatcher:
"$-": self._imm_left_sib, "$-": self._imm_left_sib,
"$++": self._right_sib, "$++": self._right_sib,
"$--": self._left_sib, "$--": self._left_sib,
">+": self._imm_right_child,
">-": self._imm_left_child,
">++": self._right_child, ">++": self._right_child,
">--": self._left_child, ">--": self._left_child,
"<+": self._imm_right_parent,
"<-": self._imm_left_parent,
"<++": self._right_parent, "<++": self._right_parent,
"<--": self._left_parent, "<--": self._left_parent,
} }
@ -427,12 +431,34 @@ cdef class DependencyMatcher:
def _left_sib(self, doc, node): def _left_sib(self, doc, node):
return [doc[child.i] for child in doc[node].head.children if child.i < node] return [doc[child.i] for child in doc[node].head.children if child.i < node]
def _imm_right_child(self, doc, node):
    """Find the child of `doc[node]` that sits directly to its right.

    Registered for the ">+" operator in the operator table.

    doc: Indexable sequence of tokens.
    node (int): Index of the token whose children are inspected.
    RETURNS (list): A one-element list holding the child at index
        `node + 1`, or an empty list if no child has that index.
    """
    for child in doc[node].children:
        if child.i == node + 1:
            return [doc[child.i]]
    return []
def _imm_left_child(self, doc, node):
    """Find the child of `doc[node]` that sits directly to its left.

    Registered for the ">-" operator in the operator table.

    doc: Indexable sequence of tokens.
    node (int): Index of the token whose children are inspected.
    RETURNS (list): A one-element list holding the child at index
        `node - 1`, or an empty list if no child has that index.
    """
    for child in doc[node].children:
        if child.i == node - 1:
            return [doc[child.i]]
    return []
def _right_child(self, doc, node): def _right_child(self, doc, node):
return [doc[child.i] for child in doc[node].children if child.i > node] return [doc[child.i] for child in doc[node].children if child.i > node]
def _left_child(self, doc, node): def _left_child(self, doc, node):
return [doc[child.i] for child in doc[node].children if child.i < node] return [doc[child.i] for child in doc[node].children if child.i < node]
def _imm_right_parent(self, doc, node):
    """Return the head of `doc[node]` if it sits directly to its right.

    Registered for the "<+" operator in the operator table.

    doc: Indexable sequence of tokens.
    node (int): Index of the token whose head is inspected.
    RETURNS (list): A one-element list holding the head when its index is
        `node + 1`, otherwise an empty list.
    """
    if doc[node].head.i == node + 1:
        return [doc[node].head]
    return []
def _imm_left_parent(self, doc, node):
    """Return the head of `doc[node]` if it sits directly to its left.

    Registered for the "<-" operator in the operator table.

    doc: Indexable sequence of tokens.
    node (int): Index of the token whose head is inspected.
    RETURNS (list): A one-element list holding the head when its index is
        `node - 1`, otherwise an empty list.
    """
    if doc[node].head.i == node - 1:
        return [doc[node].head]
    return []
def _right_parent(self, doc, node): def _right_parent(self, doc, node):
if doc[node].head.i > node: if doc[node].head.i > node:
return [doc[node].head] return [doc[node].head]

View File

@ -829,6 +829,11 @@ def _get_attr_values(spec, string_store):
return attr_values return attr_values
def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
    """Build the key under which a predicate is stored and looked up.

    All predicate classes and the extra-predicate lookup use this single
    helper so their keys always have the same shape.

    attr: Attribute the predicate applies to.
    predicate: Name of the predicate/operator.
    value: Predicate value; serialized with `srsly.json_dumps(sort_keys=True)`
        so the key is built from its JSON string.
    regex: Regex flag of the predicate (keyword-only, defaults to False).
    fuzzy: Fuzzy setting of the predicate (keyword-only, defaults to None).
    RETURNS (tuple): `(attr, regex, fuzzy, predicate, serialized_value)`.
    """
    # tuple order affects performance
    return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
# These predicate helper classes are used to match the REGEX, IN, >= etc # These predicate helper classes are used to match the REGEX, IN, >= etc
# extensions to the matcher introduced in #3173. # extensions to the matcher introduced in #3173.
@ -848,7 +853,7 @@ class _FuzzyPredicate:
fuzz = self.predicate[len("FUZZY"):] # number after prefix fuzz = self.predicate[len("FUZZY"):] # number after prefix
self.fuzzy = int(fuzz) if fuzz else -1 self.fuzzy = int(fuzz) if fuzz else -1
self.fuzzy_compare = fuzzy_compare self.fuzzy_compare = fuzzy_compare
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
def __call__(self, Token token): def __call__(self, Token token):
if self.is_extension: if self.is_extension:
@ -870,7 +875,7 @@ class _RegexPredicate:
self.value = re.compile(value) self.value = re.compile(value)
self.predicate = predicate self.predicate = predicate
self.is_extension = is_extension self.is_extension = is_extension
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = _predicate_cache_key(self.attr, self.predicate, value)
if self.predicate not in self.operators: if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -906,7 +911,7 @@ class _SetPredicate:
self.value = set(get_string_id(v) for v in value) self.value = set(get_string_id(v) for v in value)
self.predicate = predicate self.predicate = predicate
self.is_extension = is_extension self.is_extension = is_extension
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
if self.predicate not in self.operators: if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -978,7 +983,7 @@ class _ComparisonPredicate:
self.value = value self.value = value
self.predicate = predicate self.predicate = predicate
self.is_extension = is_extension self.is_extension = is_extension
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) self.key = _predicate_cache_key(self.attr, self.predicate, value)
if self.predicate not in self.operators: if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
@ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
if isinstance(value, dict): if isinstance(value, dict):
for type_, cls in predicate_types.items(): for type_, cls in predicate_types.items():
if type_ in value: if type_ in value:
key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True)) key = _predicate_cache_key(attr, type_, value[type_])
if key in seen_predicates: if key in seen_predicates:
output.append(seen_predicates[key]) output.append(seen_predicates[key])
else: else:

View File

@ -89,6 +89,14 @@ def load_kb(
return kb_from_file return kb_from_file
@registry.misc("spacy.EmptyKB.v2")
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
    """Provide the factory registered as "spacy.EmptyKB.v2".

    RETURNS (Callable[[Vocab, int], KnowledgeBase]): A function that takes a
        vocab and an entity vector length and returns a new
        `InMemoryLookupKB` built from them.
    """

    def empty_kb_factory(vocab: Vocab, entity_vector_length: int) -> KnowledgeBase:
        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)

    return empty_kb_factory
@registry.misc("spacy.EmptyKB.v1") @registry.misc("spacy.EmptyKB.v1")
def empty_kb( def empty_kb(
entity_vector_length: int, entity_vector_length: int,

View File

@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
cdef np.ndarray step_actions cdef np.ndarray step_actions
scores = [] scores = []
while sizes.states >= 1: while sizes.states >= 1 and (actions is None or len(actions) > 0):
step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f") step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
step_actions = actions[0] if actions is not None else None step_actions = actions[0] if actions is not None else None
assert step_actions is None or step_actions.size == sizes.states, \
f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
with nogil: with nogil:
_predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes) _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
if actions is None: if actions is None:

View File

@ -58,6 +58,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"overwrite": False, "overwrite": False,
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True, "use_gold_ents": True,
"candidates_batch_size": 1, "candidates_batch_size": 1,
@ -84,6 +85,7 @@ def make_entity_linker(
get_candidates_batch: Callable[ get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
], ],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool, overwrite: bool,
scorer: Optional[Callable], scorer: Optional[Callable],
use_gold_ents: bool, use_gold_ents: bool,
@ -106,6 +108,7 @@ def make_entity_linker(
get_candidates_batch ( get_candidates_batch (
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
scorer (Optional[Callable]): The scoring method. scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations. component must provide entity annotations.
@ -114,28 +117,9 @@ def make_entity_linker(
prediction is discarded. If None, predictions are not filtered by any threshold. prediction is discarded. If None, predictions are not filtered by any threshold.
save_activations (bool): save model activations in Doc when annotating. save_activations (bool): save model activations in Doc when annotating.
""" """
if not model.attrs.get("include_span_maker", False): if not model.attrs.get("include_span_maker", False):
try: raise ValueError(Errors.E4005)
from spacy_legacy.components.entity_linker import EntityLinker_v1
except:
raise ImportError(
"In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
)
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
model,
name,
labels_discard=labels_discard,
n_sents=n_sents,
incl_prior=incl_prior,
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
overwrite=overwrite,
scorer=scorer,
)
return EntityLinker( return EntityLinker(
nlp.vocab, nlp.vocab,
model, model,
@ -147,6 +131,7 @@ def make_entity_linker(
entity_vector_length=entity_vector_length, entity_vector_length=entity_vector_length,
get_candidates=get_candidates, get_candidates=get_candidates,
get_candidates_batch=get_candidates_batch, get_candidates_batch=get_candidates_batch,
generate_empty_kb=generate_empty_kb,
overwrite=overwrite, overwrite=overwrite,
scorer=scorer, scorer=scorer,
use_gold_ents=use_gold_ents, use_gold_ents=use_gold_ents,
@ -188,6 +173,7 @@ class EntityLinker(TrainablePipe):
get_candidates_batch: Callable[ get_candidates_batch: Callable[
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
], ],
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
overwrite: bool = False, overwrite: bool = False,
scorer: Optional[Callable] = entity_linker_score, scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool, use_gold_ents: bool,
@ -212,6 +198,7 @@ class EntityLinker(TrainablePipe):
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
Iterable[Candidate]] Iterable[Candidate]]
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
overwrite (bool): Whether to overwrite existing non-empty annotations. overwrite (bool): Whether to overwrite existing non-empty annotations.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
@ -219,6 +206,7 @@ class EntityLinker(TrainablePipe):
candidates_batch_size (int): Size of batches for entity candidate generation. candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
threshold, prediction is discarded. If None, predictions are not filtered by any threshold. threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/entitylinker#init DOCS: https://spacy.io/api/entitylinker#init
""" """
@ -235,6 +223,7 @@ class EntityLinker(TrainablePipe):
self.model = model self.model = model
self.name = name self.name = name
self.labels_discard = list(labels_discard) self.labels_discard = list(labels_discard)
# how many neighbour sentences to take into account
self.n_sents = n_sents self.n_sents = n_sents
self.incl_prior = incl_prior self.incl_prior = incl_prior
self.incl_context = incl_context self.incl_context = incl_context
@ -242,9 +231,7 @@ class EntityLinker(TrainablePipe):
self.get_candidates_batch = get_candidates_batch self.get_candidates_batch = get_candidates_batch
self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False) self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account self.kb = generate_empty_kb(self.vocab, entity_vector_length)
# create an empty KB by default
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer self.scorer = scorer
self.use_gold_ents = use_gold_ents self.use_gold_ents = use_gold_ents
self.candidates_batch_size = candidates_batch_size self.candidates_batch_size = candidates_batch_size
@ -266,7 +253,7 @@ class EntityLinker(TrainablePipe):
# Raise an error if the knowledge base is not initialized. # Raise an error if the knowledge base is not initialized.
if self.kb is None: if self.kb is None:
raise ValueError(Errors.E1018.format(name=self.name)) raise ValueError(Errors.E1018.format(name=self.name))
if len(self.kb) == 0: if hasattr(self.kb, "is_empty") and self.kb.is_empty():
raise ValueError(Errors.E139.format(name=self.name)) raise ValueError(Errors.E139.format(name=self.name))
def initialize( def initialize(

View File

@ -1,5 +1,6 @@
from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple
from thinc.api import Model, set_dropout_rate, Optimizer, Config from thinc.api import Model, set_dropout_rate, Optimizer, Config
from thinc.types import Floats2d
from itertools import islice from itertools import islice
from .trainable_pipe import TrainablePipe from .trainable_pipe import TrainablePipe
@ -157,39 +158,9 @@ class Tok2Vec(TrainablePipe):
DOCS: https://spacy.io/api/tok2vec#update DOCS: https://spacy.io/api/tok2vec#update
""" """
if losses is None:
losses = {}
validate_examples(examples, "Tok2Vec.update") validate_examples(examples, "Tok2Vec.update")
docs = [eg.predicted for eg in examples] docs = [eg.predicted for eg in examples]
set_dropout_rate(self.model, drop) return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses)
tokvecs, bp_tokvecs = self.model.begin_update(docs)
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
losses.setdefault(self.name, 0.0)
def accumulate_gradient(one_d_tokvecs):
"""Accumulate tok2vec loss and gradient. This is passed as a callback
to all but the last listener. Only the last one does the backprop.
"""
nonlocal d_tokvecs
for i in range(len(one_d_tokvecs)):
d_tokvecs[i] += one_d_tokvecs[i]
losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
def backprop(one_d_tokvecs):
"""Callback to actually do the backprop. Passed to last listener."""
accumulate_gradient(one_d_tokvecs)
d_docs = bp_tokvecs(d_tokvecs)
if sgd is not None:
self.finish_update(sgd)
return d_docs
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners[:-1]:
listener.receive(batch_id, tokvecs, accumulate_gradient)
if self.listeners:
self.listeners[-1].receive(batch_id, tokvecs, backprop)
return losses
def get_loss(self, examples, scores) -> None: def get_loss(self, examples, scores) -> None:
pass pass
@ -219,6 +190,96 @@ class Tok2Vec(TrainablePipe):
def add_label(self, label): def add_label(self, label):
raise NotImplementedError raise NotImplementedError
def distill(
    self,
    teacher_pipe: Optional["TrainablePipe"],
    examples: Iterable["Example"],
    *,
    drop: float = 0.0,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    """Performs an update of the student pipe's model using the
    student's distillation examples and sets the annotations
    of the teacher's distillation examples using the teacher pipe.

    teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use
        for prediction. A ValueError (E4002) is raised when it is None.
    examples (Iterable[Example]): Distillation examples. The reference (teacher)
        and predicted (student) docs must have the same number of tokens and the
        same orthography. The iterable is traversed exactly once.
    drop (float): dropout rate.
    sgd (Optional[Optimizer]): An optimizer, forwarded to the tok2vec
        update. No optimizer step is taken when it is None.
    losses (Optional[Dict[str, float]]): Optional record of loss during
        distillation.
    RETURNS: The updated losses dictionary.

    DOCS: https://spacy.io/api/tok2vec#distill
    """
    # By default we require a teacher pipe, but there are downstream
    # implementations that don't require a pipe.
    if teacher_pipe is None:
        raise ValueError(Errors.E4002.format(name=self.name))
    # Collect both sides in a single pass, so that a one-shot iterator
    # of examples does not leave the student batch empty.
    teacher_docs = []
    student_docs = []
    for eg in examples:
        teacher_docs.append(eg.reference)
        student_docs.append(eg.predicted)
    # Annotate the teacher docs with the teacher's own predictions before
    # the student's model is updated.
    teacher_preds = teacher_pipe.predict(teacher_docs)
    teacher_pipe.set_annotations(teacher_docs, teacher_preds)
    return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses)
def _update_with_docs(
    self,
    docs: Iterable[Doc],
    *,
    drop: float = 0.0,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    """Run the tok2vec model on a batch of docs and pass the output on to
    the listeners, together with the callbacks that receive their gradients.

    docs (Iterable[Doc]): The docs to run the model on.
    drop (float): dropout rate.
    sgd (Optional[Optimizer]): An optimizer, handed to the backprop callback.
    losses (Optional[Dict[str, float]]): Optional record of the loss. A new
        dictionary is created when it is None.
    RETURNS (Dict[str, float]): The losses dictionary, with an entry for this
        component's name.
    """
    if losses is None:
        losses = {}
    losses.setdefault(self.name, 0.0)
    # The docs are handed to both the forward pass and the batch ID
    # computation, so materialize them once in case a one-shot iterator
    # was passed in.
    docs = list(docs)
    set_dropout_rate(self.model, drop)
    tokvecs, accumulate_gradient, backprop = self._create_backprops(
        docs, losses, sgd=sgd
    )
    batch_id = Tok2VecListener.get_batch_id(docs)
    # All listeners but the last one only accumulate their gradients; the
    # last one receives the callback that actually backpropagates.
    for listener in self.listeners[:-1]:
        listener.receive(batch_id, tokvecs, accumulate_gradient)
    if self.listeners:
        self.listeners[-1].receive(batch_id, tokvecs, backprop)
    return losses
def _create_backprops(
    self,
    docs: Iterable[Doc],
    losses: Dict[str, float],
    *,
    sgd: Optional[Optimizer] = None,
) -> Tuple[List[Floats2d], Callable, Callable]:
    """Run the forward pass and build the two gradient callbacks that are
    handed to the listeners.

    docs (Iterable[Doc]): The docs to run the model on.
    losses (Dict[str, float]): Record of the loss. The entry for this
        component's name must already exist; it is incremented in place.
    sgd (Optional[Optimizer]): An optimizer. If it is None, the backprop
        callback does not call finish_update.
    RETURNS (Tuple[List[Floats2d], Callable, Callable]): The model output
        (one array per doc), the callback that only accumulates gradients
        and the callback that accumulates and then backpropagates.
    """
    tokvecs, bp_tokvecs = self.model.begin_update(docs)
    # One gradient buffer per output array, shared by both callbacks.
    d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]

    def accumulate_gradient(one_d_tokvecs):
        """Accumulate tok2vec loss and gradient. This is passed as a callback
        to all but the last listener. Only the last one does the backprop.
        """
        # The buffers are only modified through item assignment, so the
        # enclosing name is never rebound and needs no nonlocal declaration.
        for i, one_d_tokvec in enumerate(one_d_tokvecs):
            d_tokvecs[i] += one_d_tokvec
            losses[self.name] += float((one_d_tokvec**2).sum())
        return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]

    def backprop(one_d_tokvecs):
        """Callback to actually do the backprop. Passed to last listener."""
        accumulate_gradient(one_d_tokvecs)
        d_docs = bp_tokvecs(d_tokvecs)
        if sgd is not None:
            self.finish_update(sgd)
        return d_docs

    return tokvecs, accumulate_gradient, backprop
class Tok2VecListener(Model): class Tok2VecListener(Model):
"""A layer that gets fed its answers from an upstream connection, """A layer that gets fed its answers from an upstream connection,

View File

@ -36,6 +36,11 @@ from ..errors import Errors, Warnings
from .. import util from .. import util
# TODO: Remove when we switch to Cython 3.
cdef extern from "<algorithm>" namespace "std" nogil:
bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
NUMPY_OPS = NumpyOps() NUMPY_OPS = NumpyOps()
@ -253,8 +258,8 @@ class Parser(TrainablePipe):
# batch uniform length. Since we do not have a gold standard # batch uniform length. Since we do not have a gold standard
# sequence, we use the teacher's predictions as the gold # sequence, we use the teacher's predictions as the gold
# standard. # standard.
max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
states = self._init_batch(teacher_pipe, student_docs, max_moves) states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
else: else:
states = self.moves.init_batch(student_docs) states = self.moves.init_batch(student_docs)
@ -265,12 +270,12 @@ class Parser(TrainablePipe):
# gradients of the student's transition distributions relative to the # gradients of the student's transition distributions relative to the
# teacher's distributions. # teacher's distributions.
student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves, student_inputs = TransitionModelInputs(docs=student_docs,
max_moves=max_moves) states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = states2actions(student_states) actions = _states_diff_to_actions(states, student_states)
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples], teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
moves=self.moves, actions=actions) states=states, moves=teacher_pipe.moves, actions=actions)
(_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs) (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores) loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
@ -522,7 +527,7 @@ class Parser(TrainablePipe):
set_dropout_rate(self.model, 0.0) set_dropout_rate(self.model, 0.0)
student_inputs = TransitionModelInputs(docs=docs, moves=self.moves) student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = states2actions(student_states) actions = _states_to_actions(student_states)
teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions) teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
_, teacher_scores = self._rehearsal_model.predict(teacher_inputs) _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
@ -642,7 +647,7 @@ class Parser(TrainablePipe):
raise ValueError(Errors.E149) from None raise ValueError(Errors.E149) from None
return self return self
def _init_batch(self, teacher_step_model, docs, max_length): def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
"""Make a square batch of length equal to the shortest transition """Make a square batch of length equal to the shortest transition
sequence or a cap. A long sequence or a cap. A long
doc will get multiple states. Let's say we have a doc of length 2*N, doc will get multiple states. Let's say we have a doc of length 2*N,
@ -651,10 +656,12 @@ class Parser(TrainablePipe):
_init_gold_batch, this version uses a teacher model to generate the _init_gold_batch, this version uses a teacher model to generate the
cut sequences.""" cut sequences."""
cdef: cdef:
StateClass start_state
StateClass state StateClass state
Transition action TransitionSystem moves = teacher_pipe.moves
all_states = self.moves.init_batch(docs)
# Start with the same heuristic as in supervised training: exclude
# docs that are within the maximum length.
all_states = moves.init_batch(docs)
states = [] states = []
to_cut = [] to_cut = []
for state, doc in zip(all_states, docs): for state, doc in zip(all_states, docs):
@ -663,18 +670,28 @@ class Parser(TrainablePipe):
states.append(state) states.append(state)
else: else:
to_cut.append(state) to_cut.append(state)
if not to_cut:
return states
# Parse the states that are too long with the teacher's parsing model.
teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
states=[state.copy() for state in to_cut])
(teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
# Step through the teacher's actions and store every state after
# each multiple of max_length.
teacher_actions = _states_to_actions(teacher_states)
while to_cut: while to_cut:
states.extend(state.copy() for state in to_cut) states.extend(state.copy() for state in to_cut)
# Move states forward max_length actions. for step_actions in teacher_actions[:max_length]:
length = 0 to_cut = moves.apply_actions(to_cut, step_actions)
while to_cut and length < max_length: teacher_actions = teacher_actions[max_length:]
teacher_scores = teacher_step_model.predict(to_cut)
self.transition_states(to_cut, teacher_scores)
# States that are completed do not need further cutting.
to_cut = [state for state in to_cut if not state.is_final()]
length += 1
return states
if len(teacher_actions) < max_length:
break
return states
def _init_gold_batch(self, examples, max_length): def _init_gold_batch(self, examples, max_length):
"""Make a square batch, of length equal to the shortest transition """Make a square batch, of length equal to the shortest transition
@ -736,7 +753,7 @@ def _change_attrs(model, **kwargs):
model.attrs[key] = value model.attrs[key] = value
def states2actions(states: List[StateClass]) -> List[Ints1d]: def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
cdef int step cdef int step
cdef StateClass state cdef StateClass state
cdef StateC* c_state cdef StateC* c_state
@ -757,3 +774,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]:
actions.append(numpy.array(step_actions, dtype="i")) actions.append(numpy.array(step_actions, dtype="i"))
return actions return actions
def _states_diff_to_actions(
    before_states: List[StateClass],
    after_states: List[StateClass]
) -> List[Ints1d]:
    """
    Return for two sets of states the actions to go from the first set of
    states to the second set of states. The histories of the first set of
    states must be a prefix of the second set of states.

    before_states (List[StateClass]): The states to start from.
    after_states (List[StateClass]): The states to arrive at, aligned with
        before_states by position.
    RETURNS (List[Ints1d]): One array per step, holding the action of that
        step for every state pair that still has actions left.
    """
    cdef StateClass before_state, after_state
    cdef StateC* c_state_before
    cdef StateC* c_state_after

    assert len(before_states) == len(after_states)

    # Check invariant: before states histories must be prefixes of after states.
    for before_state, after_state in zip(before_states, after_states):
        c_state_before = before_state.c
        c_state_after = after_state.c

        # The three-iterator form of std::equal reads as many elements from
        # the second range as the first range holds, so the lengths have to
        # be checked first. The same check keeps the size difference in the
        # loop below from going negative.
        assert c_state_before.history.size() <= c_state_after.history.size()
        assert equal(c_state_before.history.begin(),
                     c_state_before.history.end(),
                     c_state_after.history.begin())

    actions = []
    while True:
        step = len(actions)

        step_actions = []
        for before_state, after_state in zip(before_states, after_states):
            c_state_before = before_state.c
            c_state_after = after_state.c
            # Only state pairs that still have unconsumed actions take part
            # in this step.
            if step < c_state_after.history.size() - c_state_before.history.size():
                step_actions.append(c_state_after.history[c_state_before.history.size() + step])

        # We are done if we have exhausted all histories.
        if len(step_actions) == 0:
            break

        actions.append(numpy.array(step_actions, dtype="i"))

    return actions

View File

@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
tokens = sv_tokenizer(text) tokens = sv_tokenizer(text)
assert len(tokens) == 3 assert len(tokens) == 3
@pytest.mark.issue(12311)
@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
    """A text with a word-internal colon must come out as a single token."""
    doc = sv_tokenizer(text)
    n_tokens = len(doc)
    assert n_tokens == 1

View File

@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
("the", "brown", "$--", 0), ("the", "brown", "$--", 0),
("brown", "the", "$--", 1), ("brown", "the", "$--", 1),
("brown", "brown", "$--", 0), ("brown", "brown", "$--", 0),
("over", "jumped", "<+", 0),
("quick", "fox", "<+", 0),
("the", "quick", "<+", 0),
("brown", "fox", "<+", 1),
("quick", "fox", "<++", 1), ("quick", "fox", "<++", 1),
("quick", "over", "<++", 0), ("quick", "over", "<++", 0),
("over", "jumped", "<++", 0), ("over", "jumped", "<++", 0),
("the", "fox", "<++", 2), ("the", "fox", "<++", 2),
("brown", "fox", "<-", 0),
("fox", "over", "<-", 0),
("the", "over", "<-", 0),
("over", "jumped", "<-", 1),
("brown", "fox", "<--", 0), ("brown", "fox", "<--", 0),
("fox", "jumped", "<--", 0), ("fox", "jumped", "<--", 0),
("fox", "over", "<--", 1), ("fox", "over", "<--", 1),
("fox", "brown", ">+", 0),
("over", "fox", ">+", 0),
("over", "the", ">+", 0),
("jumped", "over", ">+", 1),
("jumped", "over", ">++", 1), ("jumped", "over", ">++", 1),
("fox", "lazy", ">++", 0), ("fox", "lazy", ">++", 0),
("over", "the", ">++", 0), ("over", "the", ">++", 0),
("jumped", "over", ">-", 0),
("fox", "quick", ">-", 0),
("brown", "quick", ">-", 0),
("fox", "brown", ">-", 1),
("brown", "fox", ">--", 0), ("brown", "fox", ">--", 0),
("fox", "brown", ">--", 1), ("fox", "brown", ">--", 1),
("jumped", "fox", ">--", 1), ("jumped", "fox", ">--", 1),

View File

@ -0,0 +1,61 @@
import numpy
import pytest
from spacy.lang.en import English
from spacy.ml.tb_framework import TransitionModelInputs
from spacy.training import Example
# Texts paired with dependency annotations. "heads" and "deps" each hold one
# entry per token of the text; the fixture below registers the "deps" values
# as parser labels.
# NOTE(review): "heads" presumably holds the index of each token's head, with
# the ROOT token pointing at itself - confirm against Example.from_dict.
TRAIN_DATA = [
    (
        "They trade mortgage-backed securities.",
        {
            "heads": [1, 1, 4, 4, 5, 1, 1],
            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
        },
    ),
    (
        "I like London and Berlin.",
        {
            "heads": [1, 1, 1, 2, 2, 1],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
        },
    ),
]
@pytest.fixture
def nlp_parser():
    """Provide an English pipeline with an initialized parser component.

    The parser gets every dependency label that occurs in TRAIN_DATA. The
    pipeline is initialized without training examples, so none are built.

    RETURNS: A tuple of the pipeline and its parser component.
    """
    nlp = English()
    parser = nlp.add_pipe("parser")
    for _, annotations in TRAIN_DATA:
        for dep in annotations["deps"]:
            parser.add_label(dep)
    nlp.initialize()

    return nlp, parser
def test_incorrect_number_of_actions(nlp_parser):
    """The parser model must reject a step whose number of actions does not
    match the number of docs."""
    nlp, parser = nlp_parser
    doc = nlp.make_doc("test")

    mismatches = [
        # Too many actions for the number of docs
        ([doc], [0, 0]),
        # Too few actions for the number of docs
        ([doc, doc], [0]),
    ]
    for batch, step_actions in mismatches:
        # The inputs are built inside the context manager as well, so that
        # it does not matter which call raises the assertion.
        with pytest.raises(AssertionError):
            parser.model.predict(
                TransitionModelInputs(
                    docs=batch,
                    moves=parser.moves,
                    actions=[numpy.array(step_actions, dtype="i")],
                )
            )

View File

@ -623,7 +623,9 @@ def test_is_distillable():
assert ner.is_distillable assert ner.is_distillable
def test_distill(): @pytest.mark.slow
@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
def test_distill(max_moves):
teacher = English() teacher = English()
teacher_ner = teacher.add_pipe("ner") teacher_ner = teacher.add_pipe("ner")
train_examples = [] train_examples = []
@ -641,6 +643,7 @@ def test_distill():
student = English() student = English()
student_ner = student.add_pipe("ner") student_ner = student.add_pipe("ner")
student_ner.cfg["update_with_oracle_cut_size"] = max_moves
student_ner.initialize( student_ner.initialize(
get_examples=lambda: train_examples, labels=teacher_ner.label_data get_examples=lambda: train_examples, labels=teacher_ner.label_data
) )

View File

@ -463,7 +463,9 @@ def test_is_distillable():
assert parser.is_distillable assert parser.is_distillable
def test_distill(): @pytest.mark.slow
@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
def test_distill(max_moves):
teacher = English() teacher = English()
teacher_parser = teacher.add_pipe("parser") teacher_parser = teacher.add_pipe("parser")
train_examples = [] train_examples = []
@ -481,6 +483,7 @@ def test_distill():
student = English() student = English()
student_parser = student.add_pipe("parser") student_parser = student.add_pipe("parser")
student_parser.cfg["update_with_oracle_cut_size"] = max_moves
student_parser.initialize( student_parser.initialize(
get_examples=lambda: train_examples, labels=teacher_parser.label_data get_examples=lambda: train_examples, labels=teacher_parser.label_data
) )

View File

@ -353,6 +353,9 @@ def test_kb_default(nlp):
"""Test that the default (empty) KB is loaded upon construction""" """Test that the default (empty) KB is loaded upon construction"""
entity_linker = nlp.add_pipe("entity_linker", config={}) entity_linker = nlp.add_pipe("entity_linker", config={})
assert len(entity_linker.kb) == 0 assert len(entity_linker.kb) == 0
with pytest.raises(ValueError, match="E139"):
# this raises an error because the KB is empty
entity_linker.validate_kb()
assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_entities() == 0
assert entity_linker.kb.get_size_aliases() == 0 assert entity_linker.kb.get_size_aliases() == 0
# 64 is the default value from pipeline.entity_linker # 64 is the default value from pipeline.entity_linker
@ -990,13 +993,11 @@ def test_scorer_links():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name,config", "name,config",
[ [
("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}),
], ],
) )
# fmt: on # fmt: on
def test_legacy_architectures(name, config): def test_legacy_architectures(name, config):
from spacy_legacy.components.entity_linker import EntityLinker_v1
# Ensure that the legacy architectures still work # Ensure that the legacy architectures still work
vector_length = 3 vector_length = 3
@ -1019,10 +1020,7 @@ def test_legacy_architectures(name, config):
return mykb return mykb
entity_linker = nlp.add_pipe(name, config={"model": config}) entity_linker = nlp.add_pipe(name, config={"model": config})
if config["@architectures"] == "spacy.EntityLinker.v1": assert isinstance(entity_linker, EntityLinker)
assert isinstance(entity_linker, EntityLinker_v1)
else:
assert isinstance(entity_linker, EntityLinker)
entity_linker.set_kb(create_kb) entity_linker.set_kb(create_kb)
optimizer = nlp.initialize(get_examples=lambda: train_examples) optimizer = nlp.initialize(get_examples=lambda: train_examples)

View File

@ -540,3 +540,86 @@ def test_tok2vec_listeners_textcat():
assert cats1["imperative"] < 0.9 assert cats1["imperative"] < 0.9
assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] assert [t.tag_ for t in docs[0]] == ["V", "J", "N"]
assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
# Pipeline config used by the distillation test below: a shared tok2vec
# component followed by a tagger whose model reads the tok2vec output
# through a Tok2VecListener.
cfg_string_distillation = """
[nlp]
lang = "en"
pipeline = ["tok2vec","tagger"]
[components]
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
rows = [2000, 1000, 1000, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
"""
def test_tok2vec_distillation_teacher_annotations():
    """Distilling a tok2vec pipe must set the tensor of the teacher
    (reference) docs, which is still all-zero before the call."""
    config = Config().from_str(cfg_string_distillation)
    teacher_nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    student_nlp = util.load_model_from_config(config, auto_fill=True, validate=True)

    teacher_examples = []
    student_examples = []
    for item in TRAIN_DATA:
        text, annots = item[0], item[1]
        teacher_examples.append(Example.from_dict(teacher_nlp.make_doc(text), annots))
        student_examples.append(Example.from_dict(student_nlp.make_doc(text), annots))

    optimizer = teacher_nlp.initialize(lambda: teacher_examples)
    student_nlp.initialize(lambda: student_examples)

    # Since Language.distill creates a copy of the examples to use as
    # its internal teacher/student docs, we'll need to monkey-patch the
    # tok2vec pipe's distill method.
    student_tok2vec = student_nlp.get_pipe("tok2vec")
    student_tok2vec._old_distill = student_tok2vec.distill

    def tok2vec_distill_wrapper(self, teacher_pipe, examples, **kwargs):
        # No reference doc may carry a tensor before distillation...
        assert not any(eg.reference.tensor.any() for eg in examples)
        result = self._old_distill(teacher_pipe, examples, **kwargs)
        # ...and every one of them must carry one afterwards.
        assert all(eg.reference.tensor.any() for eg in examples)
        return result

    # Bind the wrapper to the pipe instance, so that it is called as a method.
    bound_wrapper = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
    student_tok2vec.distill = bound_wrapper

    student_nlp.distill(teacher_nlp, student_examples, sgd=optimizer, losses={})

View File

@ -1,7 +1,10 @@
from typing import Callable from pathlib import Path
from typing import Callable, Iterable, Any, Dict
from spacy import util import srsly
from spacy.util import ensure_path, registry, load_model_from_config
from spacy import util, Errors
from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
from spacy.kb.kb_in_memory import InMemoryLookupKB from spacy.kb.kb_in_memory import InMemoryLookupKB
from spacy.vocab import Vocab from spacy.vocab import Vocab
from thinc.api import Config from thinc.api import Config
@ -91,7 +94,10 @@ def test_serialize_subclassed_kb():
[components.entity_linker] [components.entity_linker]
factory = "entity_linker" factory = "entity_linker"
[components.entity_linker.generate_empty_kb]
@misc = "kb_test.CustomEmptyKB.v1"
[initialize] [initialize]
[initialize.components] [initialize.components]
@ -99,7 +105,7 @@ def test_serialize_subclassed_kb():
[initialize.components.entity_linker] [initialize.components.entity_linker]
[initialize.components.entity_linker.kb_loader] [initialize.components.entity_linker.kb_loader]
@misc = "spacy.CustomKB.v1" @misc = "kb_test.CustomKB.v1"
entity_vector_length = 342 entity_vector_length = 342
custom_field = 666 custom_field = 666
""" """
@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
super().__init__(vocab, entity_vector_length) super().__init__(vocab, entity_vector_length)
self.custom_field = custom_field self.custom_field = custom_field
@registry.misc("spacy.CustomKB.v1") def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
"""We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
path = ensure_path(path)
if not path.exists():
path.mkdir(parents=True)
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
def serialize_custom_fields(file_path: Path) -> None:
srsly.write_json(file_path, {"custom_field": self.custom_field})
serialize = {
"contents": lambda p: self.write_contents(p),
"strings.json": lambda p: self.vocab.strings.to_disk(p),
"custom_fields": lambda p: serialize_custom_fields(p),
}
util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
"""We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
path = ensure_path(path)
if not path.exists():
raise ValueError(Errors.E929.format(loc=path))
if not path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
def deserialize_custom_fields(file_path: Path) -> None:
self.custom_field = srsly.read_json(file_path)["custom_field"]
deserialize: Dict[str, Callable[[Any], Any]] = {
"contents": lambda p: self.read_contents(p),
"strings.json": lambda p: self.vocab.strings.from_disk(p),
"custom_fields": lambda p: deserialize_custom_fields(p),
}
util.from_disk(path, deserialize, exclude)
@registry.misc("kb_test.CustomEmptyKB.v1")
def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
return SubInMemoryLookupKB(
vocab=vocab,
entity_vector_length=entity_vector_length,
custom_field=0,
)
return empty_kb_factory
@registry.misc("kb_test.CustomKB.v1")
def custom_kb( def custom_kb(
entity_vector_length: int, custom_field: int entity_vector_length: int, custom_field: int
) -> Callable[[Vocab], InMemoryLookupKB]: ) -> Callable[[Vocab], SubInMemoryLookupKB]:
def custom_kb_factory(vocab): def custom_kb_factory(vocab):
kb = SubInMemoryLookupKB( kb = SubInMemoryLookupKB(
vocab=vocab, vocab=vocab,
@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
nlp2 = util.load_model_from_path(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir)
entity_linker2 = nlp2.get_pipe("entity_linker") entity_linker2 = nlp2.get_pipe("entity_linker")
# After IO, the KB is the standard one # After IO, the KB is the standard one
assert type(entity_linker2.kb) == InMemoryLookupKB assert type(entity_linker2.kb) == SubInMemoryLookupKB
assert entity_linker2.kb.entity_vector_length == 342 assert entity_linker2.kb.entity_vector_length == 342
assert not hasattr(entity_linker2.kb, "custom_field") assert entity_linker2.kb.custom_field == 666

View File

@ -181,7 +181,7 @@ def test_issue4042_bug2():
@pytest.mark.issue(4725) @pytest.mark.issue(4725)
def test_issue4725_1(): def test_issue4725_1():
"""Ensure the pickling of the NER goes well""" """Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector") vocab = Vocab()
nlp = English(vocab=vocab) nlp = English(vocab=vocab)
config = { config = {
"update_with_oracle_cut_size": 111, "update_with_oracle_cut_size": 111,

View File

@ -2,7 +2,6 @@ import os
import math import math
from collections import Counter from collections import Counter
from typing import Tuple, List, Dict, Any from typing import Tuple, List, Dict, Any
import pkg_resources
import time import time
from pathlib import Path from pathlib import Path
@ -1126,6 +1125,7 @@ def test_cli_find_threshold(capsys):
) )
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"reqs,output", "reqs,output",
[ [
@ -1158,6 +1158,8 @@ def test_cli_find_threshold(capsys):
], ],
) )
def test_project_check_requirements(reqs, output): def test_project_check_requirements(reqs, output):
import pkg_resources
# excessive guard against unlikely package name # excessive guard against unlikely package name
try: try:
pkg_resources.require("spacyunknowndoesnotexist12345") pkg_resources.require("spacyunknowndoesnotexist12345")

View File

@ -1,6 +1,7 @@
import os import os
from pathlib import Path from pathlib import Path
import pytest import pytest
import srsly
import subprocess import subprocess
from typer.testing import CliRunner from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc from spacy.tokens import DocBin, Doc
@ -298,3 +299,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
# Instead of checking specific wording of the output, which may change, # Instead of checking specific wording of the output, which may change,
# we'll check that this section of the debug output is present. # we'll check that this section of the debug output is present.
assert "= Trainable Lemmatizer =" in result_debug_data.stdout assert "= Trainable Lemmatizer =" in result_debug_data.stdout
# project tests
# Minimal project definition used by the project CLI tests below: two
# downloadable assets (the second one marked as "extra") and three shell
# commands.
SAMPLE_PROJECT = {
    "title": "Sample project",
    "description": "This is a project for testing",
    "assets": [
        {
            # This URL points at a fixed commit.
            "dest": "assets/spacy-readme.md",
            "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
            "checksum": "411b2c89ccf34288fae8ed126bf652f7",
        },
        {
            # NOTE(review): this URL follows the master branch while the
            # checksum is fixed, so the checksum may stop matching once the
            # file changes upstream - consider pinning a commit here too.
            "dest": "assets/citation.cff",
            "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
            "checksum": "c996bfd80202d480eb2e592369714e5e",
            "extra": True,
        },
    ],
    "commands": [
        {
            "name": "ok",
            "help": "print ok",
            "script": ["python -c \"print('okokok')\""],
        },
        {
            # Declares the file it creates as an output.
            "name": "create",
            "help": "make a file",
            "script": ["touch abc.txt"],
            "outputs": ["abc.txt"],
        },
        {
            "name": "clean",
            "help": "remove test file",
            "script": ["rm abc.txt"],
        },
    ],
}
# YAML text of the sample project, written to project.yml by the fixture.
SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
@pytest.fixture
def project_dir():
    """Yield a temporary directory that holds the sample project.yml."""
    with make_tempdir() as tmp_dir:
        config_path = tmp_dir / "project.yml"
        config_path.write_text(SAMPLE_PROJECT_TEXT)
        yield tmp_dir
def test_project_document(project_dir):
    """`project document` must write a README that contains the project
    description."""
    readme_path = project_dir / "README.md"
    assert not readme_path.exists(), "README already exists"

    cli_args = ["project", "document", str(project_dir), "-o", str(readme_path)]
    result = CliRunner().invoke(app, cli_args)
    assert result.exit_code == 0
    assert readme_path.is_file()

    readme_text = readme_path.read_text("utf-8")
    assert SAMPLE_PROJECT["description"] in readme_text
def test_project_assets(project_dir):
    """`project assets` should download the regular and the extra assets."""
    asset_dir = project_dir / "assets"
    assert not asset_dir.exists(), "Assets dir is already present"
    target = str(project_dir)

    # First pass: plain invocation, only the regular asset is checked.
    plain = CliRunner().invoke(app, ["project", "assets", target])
    assert plain.exit_code == 0
    assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"

    # Second pass: check that extras work.
    with_extra = CliRunner().invoke(app, ["project", "assets", "--extra", target])
    assert with_extra.exit_code == 0
    assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
def test_project_run(project_dir):
    """`project run` should honour `--dry` and execute commands otherwise."""

    def invoke_run(*run_args):
        # Each call uses a fresh runner, with the project dir as last argument.
        return CliRunner().invoke(
            app, ["project", "run", *run_args, str(project_dir)]
        )

    test_file = project_dir / "abc.txt"

    # A dry run must succeed without creating the file.
    dry = invoke_run("--dry", "create")
    assert dry.exit_code == 0
    assert not test_file.is_file()

    # A real run creates it.
    created = invoke_run("create")
    assert created.exit_code == 0
    assert test_file.is_file()

    # Output of the executed script is visible on stdout.
    printed = invoke_run("ok")
    assert printed.exit_code == 0
    assert "okokok" in printed.stdout
@pytest.mark.parametrize(
    "options",
    [
        "",
        # "--sparse",
        "--branch v3",
        "--repo https://github.com/explosion/projects --branch v3",
    ],
)
def test_project_clone(options):
    """`project clone` should check out a project template with a README.

    options (str): Whitespace-separated extra CLI flags; may be empty.
    """
    # str.split() with no separator returns [] for an empty string, so the
    # empty-options case needs no special handling. A new local is used so the
    # parametrized argument is not rebound to a different type.
    extra_args = options.split()
    with make_tempdir() as workspace:
        out = workspace / "project"
        target = "benchmarks/ner_conll03"
        result = CliRunner().invoke(
            app, ["project", "clone", target, *extra_args, str(out)]
        )
        assert result.exit_code == 0
        assert (out / "README.md").is_file()
def test_project_push_pull(project_dir):
    """Round-trip a command output through `project push` / `project pull`."""
    remote = "xyz"
    # Shallow copy: only a new top-level key is added, so the shared
    # SAMPLE_PROJECT dict is left untouched.
    proj = dict(SAMPLE_PROJECT)
    test_file = project_dir / "abc.txt"

    def invoke(*cli_args):
        # Each call uses a fresh runner, with the project dir as last argument.
        return CliRunner().invoke(app, ["project", *cli_args, str(project_dir)])

    # Everything runs inside the context so the remote directory exists for
    # both the push and the pull.
    with make_tempdir() as remote_dir:
        proj["remotes"] = {remote: str(remote_dir)}
        (project_dir / "project.yml").write_text(srsly.yaml_dumps(proj))

        # Produce the output file, then push it to the remote.
        created = invoke("run", "create")
        assert created.exit_code == 0
        assert test_file.is_file()
        pushed = invoke("push", remote)
        assert pushed.exit_code == 0

        # Remove the local copy ...
        cleaned = invoke("run", "clean")
        assert cleaned.exit_code == 0
        assert not test_file.exists()

        # ... and expect the pull to bring it back.
        pulled = invoke("pull", remote)
        assert pulled.exit_code == 0
        assert test_file.is_file()

View File

@ -98,7 +98,7 @@ def assert_sents_error(doc):
def warn_error(proc_name, proc, docs, e): def warn_error(proc_name, proc, docs, e):
logger = logging.getLogger("spacy") logger = logging.getLogger("spacy")
logger.warning(f"Trouble with component {proc_name}.") logger.warning("Trouble with component %s.", proc_name)
@pytest.fixture @pytest.fixture

View File

@ -84,7 +84,7 @@ def test_issue1539():
@pytest.mark.issue(1807) @pytest.mark.issue(1807)
def test_issue1807(): def test_issue1807():
"""Test vocab.set_vector also adds the word to the vocab.""" """Test vocab.set_vector also adds the word to the vocab."""
vocab = Vocab(vectors_name="test_issue1807") vocab = Vocab()
assert "hello" not in vocab assert "hello" not in vocab
vocab.set_vector("hello", numpy.ones((50,), dtype="f")) vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
assert "hello" in vocab assert "hello" in vocab
@ -94,13 +94,12 @@ def test_issue1807():
def test_issue2871(): def test_issue2871():
"""Test that vectors recover the correct key for spaCy reserved words.""" """Test that vectors recover the correct key for spaCy reserved words."""
words = ["dog", "cat", "SUFFIX"] words = ["dog", "cat", "SUFFIX"]
vocab = Vocab(vectors_name="test_issue2871") vocab = Vocab()
vocab.vectors.resize(shape=(3, 10)) vocab.vectors.resize(shape=(3, 10))
vector_data = numpy.zeros((3, 10), dtype="f") vector_data = numpy.zeros((3, 10), dtype="f")
for word in words: for word in words:
_ = vocab[word] # noqa: F841 _ = vocab[word] # noqa: F841
vocab.set_vector(word, vector_data[0]) vocab.set_vector(word, vector_data[0])
vocab.vectors.name = "dummy_vectors"
assert vocab["dog"].rank == 0 assert vocab["dog"].rank == 0
assert vocab["cat"].rank == 1 assert vocab["cat"].rank == 1
assert vocab["SUFFIX"].rank == 2 assert vocab["SUFFIX"].rank == 2
@ -125,7 +124,7 @@ def test_issue4725_2():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors # ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows), # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
# or because of issues with pickling the NER (cf test_issue4725_1) # or because of issues with pickling the NER (cf test_issue4725_1)
vocab = Vocab(vectors_name="test_vocab_add_vector") vocab = Vocab()
data = numpy.ndarray((5, 3), dtype="f") data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0
data[1] = 2.0 data[1] = 2.0
@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
def test_vocab_add_vector(): def test_vocab_add_vector():
vocab = Vocab(vectors_name="test_vocab_add_vector") vocab = Vocab()
data = OPS.xp.ndarray((5, 3), dtype="f") data = OPS.xp.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0
data[1] = 2.0 data[1] = 2.0
@ -356,7 +355,7 @@ def test_vocab_add_vector():
def test_vocab_prune_vectors(): def test_vocab_prune_vectors():
vocab = Vocab(vectors_name="test_vocab_prune_vectors") vocab = Vocab()
_ = vocab["cat"] # noqa: F841 _ = vocab["cat"] # noqa: F841
_ = vocab["dog"] # noqa: F841 _ = vocab["dog"] # noqa: F841
_ = vocab["kitten"] # noqa: F841 _ = vocab["kitten"] # noqa: F841
@ -405,7 +404,7 @@ def test_vectors_serialize():
def test_vector_is_oov(): def test_vector_is_oov():
vocab = Vocab(vectors_name="test_vocab_is_oov") vocab = Vocab()
data = OPS.xp.ndarray((5, 3), dtype="f") data = OPS.xp.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0
data[1] = 2.0 data[1] = 2.0

View File

@ -105,6 +105,7 @@ class Doc:
start_idx: int, start_idx: int,
end_idx: int, end_idx: int,
label: Union[int, str] = ..., label: Union[int, str] = ...,
*,
kb_id: Union[int, str] = ..., kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
alignment_mode: str = ..., alignment_mode: str = ...,
@ -127,12 +128,12 @@ class Doc:
blocked: Optional[List[Span]] = ..., blocked: Optional[List[Span]] = ...,
missing: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ...,
outside: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ...,
default: str = ... default: str = ...,
) -> None: ... ) -> None: ...
@property @property
def noun_chunks(self) -> Iterator[Span]: ... def noun_chunks(self) -> Tuple[Span]: ...
@property @property
def sents(self) -> Iterator[Span]: ... def sents(self) -> Tuple[Span]: ...
@property @property
def lang(self) -> int: ... def lang(self) -> int: ...
@property @property

View File

@ -520,7 +520,7 @@ cdef class Doc:
def doc(self): def doc(self):
return self return self
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
"""Create a `Span` object from the slice """Create a `Span` object from the slice
`doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
created. created.
@ -657,9 +657,6 @@ cdef class Doc:
elif self.vocab.vectors.size > 0: elif self.vocab.vectors.size > 0:
self._vector = sum(t.vector for t in self) / len(self) self._vector = sum(t.vector for t in self) / len(self)
return self._vector return self._vector
elif self.tensor.size > 0:
self._vector = self.tensor.mean(axis=0)
return self._vector
else: else:
return xp.zeros((self.vocab.vectors_length,), dtype="float32") return xp.zeros((self.vocab.vectors_length,), dtype="float32")
@ -706,10 +703,10 @@ cdef class Doc:
return self.text return self.text
property ents: property ents:
"""The named entities in the document. Returns a tuple of named entity """The named entities in the document. Returns a list of named entity
`Span` objects, if the entity recognizer has been applied. `Span` objects, if the entity recognizer has been applied.
RETURNS (tuple): Entities in the document, one `Span` per entity. RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
DOCS: https://spacy.io/api/doc#ents DOCS: https://spacy.io/api/doc#ents
""" """
@ -867,7 +864,7 @@ cdef class Doc:
NP-level coordination, no prepositional phrases, and no relative NP-level coordination, no prepositional phrases, and no relative
clauses. clauses.
YIELDS (Span): Noun chunks in the document. RETURNS (Tuple[Span]): Noun chunks in the document.
DOCS: https://spacy.io/api/doc#noun_chunks DOCS: https://spacy.io/api/doc#noun_chunks
""" """
@ -876,36 +873,35 @@ cdef class Doc:
# Accumulate the result before beginning to iterate over it. This # Accumulate the result before beginning to iterate over it. This
# prevents the tokenization from being changed out from under us # prevents the tokenization from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts # during the iteration.
# its tokenization changing, so it's okay once we have the Span
# objects. See Issue #375.
spans = [] spans = []
for start, end, label in self.noun_chunks_iterator(self): for start, end, label in self.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label)) spans.append(Span(self, start, end, label=label))
for span in spans: return tuple(spans)
yield span
@property @property
def sents(self): def sents(self):
"""Iterate over the sentences in the document. Yields sentence `Span` """Iterate over the sentences in the document. Yields sentence `Span`
objects. Sentence spans have no label. objects. Sentence spans have no label.
YIELDS (Span): Sentences in the document. RETURNS (Tuple[Span]): Sentences in the document.
DOCS: https://spacy.io/api/doc#sents DOCS: https://spacy.io/api/doc#sents
""" """
if not self.has_annotation("SENT_START"): if not self.has_annotation("SENT_START"):
raise ValueError(Errors.E030) raise ValueError(Errors.E030)
if "sents" in self.user_hooks: if "sents" in self.user_hooks:
yield from self.user_hooks["sents"](self) return tuple(self.user_hooks["sents"](self))
else: else:
start = 0 start = 0
spans = []
for i in range(1, self.length): for i in range(1, self.length):
if self.c[i].sent_start == 1: if self.c[i].sent_start == 1:
yield Span(self, start, i) spans.append(Span(self, start, i))
start = i start = i
if start != self.length: if start != self.length:
yield Span(self, start, self.length) spans.append(Span(self, start, self.length))
return tuple(spans)
@property @property
def lang(self): def lang(self):
@ -1605,7 +1601,7 @@ cdef class Doc:
for span_group in doc_json.get("spans", {}): for span_group in doc_json.get("spans", {}):
spans = [] spans = []
for span in doc_json["spans"][span_group]: for span in doc_json["spans"][span_group]:
char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"]) char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"])
if char_span is None: if char_span is None:
raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"])) raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"]))
spans.append(char_span) spans.append(char_span)

View File

@ -74,6 +74,8 @@ class Span:
@property @property
def ents(self) -> Tuple[Span]: ... def ents(self) -> Tuple[Span]: ...
@property @property
def sents(self) -> Tuple[Span]: ...
@property
def has_vector(self) -> bool: ... def has_vector(self) -> bool: ...
@property @property
def vector(self) -> Floats1d: ... def vector(self) -> Floats1d: ...
@ -86,7 +88,7 @@ class Span:
@property @property
def text_with_ws(self) -> str: ... def text_with_ws(self) -> str: ...
@property @property
def noun_chunks(self) -> Iterator[Span]: ... def noun_chunks(self) -> Tuple[Span]: ...
@property @property
def root(self) -> Token: ... def root(self) -> Token: ...
def char_span( def char_span(
@ -94,9 +96,9 @@ class Span:
start_idx: int, start_idx: int,
end_idx: int, end_idx: int,
label: Union[int, str] = ..., label: Union[int, str] = ...,
*,
kb_id: Union[int, str] = ..., kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
id: Union[int, str] = ...,
alignment_mode: str = ..., alignment_mode: str = ...,
span_id: Union[int, str] = ..., span_id: Union[int, str] = ...,
) -> Span: ... ) -> Span: ...

View File

@ -134,10 +134,8 @@ cdef class Span:
else: else:
return True return True
cdef SpanC* span_c = self.span_c() self_tuple = self._cmp_tuple()
cdef SpanC* other_span_c = other.span_c() other_tuple = other._cmp_tuple()
self_tuple = (span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, self.id, self.doc)
other_tuple = (other_span_c.start_char, other_span_c.end_char, other_span_c.label, other_span_c.kb_id, other.id, other.doc)
# < # <
if op == 0: if op == 0:
return self_tuple < other_tuple return self_tuple < other_tuple
@ -158,8 +156,20 @@ cdef class Span:
return self_tuple >= other_tuple return self_tuple >= other_tuple
def __hash__(self): def __hash__(self):
return hash(self._cmp_tuple())
def _cmp_tuple(self):
cdef SpanC* span_c = self.span_c() cdef SpanC* span_c = self.span_c()
return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, span_c.id)) return (
span_c.start_char,
span_c.end_char,
span_c.start,
span_c.end,
span_c.label,
span_c.kb_id,
span_c.id,
self.doc,
)
def __len__(self): def __len__(self):
"""Get the number of tokens in the span. """Get the number of tokens in the span.
@ -451,20 +461,21 @@ cdef class Span:
"""Obtain the sentences that contain this span. If the given span """Obtain the sentences that contain this span. If the given span
crosses sentence boundaries, return all sentences it is a part of. crosses sentence boundaries, return all sentences it is a part of.
RETURNS (Iterable[Span]): All sentences that the span is a part of. RETURNS (Tuple[Span]): All sentences that the span is a part of.
DOCS: https://spacy.io/api/span#sents DOCS: https://spacy.io/api/span#sents
""" """
cdef int start cdef int start
cdef int i cdef int i
if "sents" in self.doc.user_span_hooks: if "sents" in self.doc.user_span_hooks:
yield from self.doc.user_span_hooks["sents"](self) return tuple(self.doc.user_span_hooks["sents"](self))
elif "sents" in self.doc.user_hooks: spans = []
if "sents" in self.doc.user_hooks:
for sentence in self.doc.user_hooks["sents"](self.doc): for sentence in self.doc.user_hooks["sents"](self.doc):
if sentence.end > self.start: if sentence.end > self.start:
if sentence.start < self.end or sentence.start == self.start == self.end: if sentence.start < self.end or sentence.start == self.start == self.end:
yield sentence spans.append(sentence)
else: else:
break break
else: else:
@ -479,12 +490,13 @@ cdef class Span:
# Now, find all the sentences in the span # Now, find all the sentences in the span
for i in range(start + 1, self.doc.length): for i in range(start + 1, self.doc.length):
if self.doc.c[i].sent_start == 1: if self.doc.c[i].sent_start == 1:
yield Span(self.doc, start, i) spans.append(Span(self.doc, start, i))
start = i start = i
if start >= self.end: if start >= self.end:
break break
if start < self.end: if start < self.end:
yield Span(self.doc, start, self.end) spans.append(Span(self.doc, start, self.end))
return tuple(spans)
@property @property
@ -492,7 +504,7 @@ cdef class Span:
"""The named entities that fall completely within the span. Returns """The named entities that fall completely within the span. Returns
a tuple of `Span` objects. a tuple of `Span` objects.
RETURNS (tuple): Entities in the span, one `Span` per entity. RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
DOCS: https://spacy.io/api/span#ents DOCS: https://spacy.io/api/span#ents
""" """
@ -507,7 +519,7 @@ cdef class Span:
ents.append(ent) ents.append(ent)
else: else:
break break
return ents return tuple(ents)
@property @property
def has_vector(self): def has_vector(self):
@ -522,8 +534,6 @@ cdef class Span:
return self.doc.user_span_hooks["has_vector"](self) return self.doc.user_span_hooks["has_vector"](self)
elif self.vocab.vectors.size > 0: elif self.vocab.vectors.size > 0:
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0:
return True
else: else:
return False return False
@ -605,13 +615,15 @@ cdef class Span:
NP-level coordination, no prepositional phrases, and no relative NP-level coordination, no prepositional phrases, and no relative
clauses. clauses.
YIELDS (Span): Noun chunks in the span. RETURNS (Tuple[Span]): Noun chunks in the span.
DOCS: https://spacy.io/api/span#noun_chunks DOCS: https://spacy.io/api/span#noun_chunks
""" """
spans = []
for span in self.doc.noun_chunks: for span in self.doc.noun_chunks:
if span.start >= self.start and span.end <= self.end: if span.start >= self.start and span.end <= self.end:
yield span spans.append(span)
return tuple(spans)
@property @property
def root(self): def root(self):
@ -656,17 +668,16 @@ cdef class Span:
else: else:
return self.doc[root] return self.doc[root]
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0): def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
"""Create a `Span` object from the slice `span.text[start : end]`. """Create a `Span` object from the slice `span.text[start : end]`.
start (int): The index of the first character of the span. start_idx (int): The index of the first character of the span.
end (int): The index of the first character after the span. end_idx (int): The index of the first character after the span.
label (Union[int, str]): A label to attach to the Span, e.g. for label (Union[int, str]): A label to attach to the Span, e.g. for
named entities. named entities.
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span. the span.
id (Union[int, str]): Unused.
alignment_mode (str): How character indices are aligned to token alignment_mode (str): How character indices are aligned to token
boundaries. Options: "strict" (character indices must be aligned boundaries. Options: "strict" (character indices must be aligned
with token boundaries), "contract" (span of all tokens completely with token boundaries), "contract" (span of all tokens completely

View File

@ -389,8 +389,6 @@ cdef class Token:
""" """
if "has_vector" in self.doc.user_token_hooks: if "has_vector" in self.doc.user_token_hooks:
return self.doc.user_token_hooks["has_vector"](self) return self.doc.user_token_hooks["has_vector"](self)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return True
return self.vocab.has_vector(self.c.lex.orth) return self.vocab.has_vector(self.c.lex.orth)
@property @property
@ -404,8 +402,6 @@ cdef class Token:
""" """
if "vector" in self.doc.user_token_hooks: if "vector" in self.doc.user_token_hooks:
return self.doc.user_token_hooks["vector"](self) return self.doc.user_token_hooks["vector"](self)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return self.doc.tensor[self.i]
else: else:
return self.vocab.get_vector(self.c.lex.orth) return self.vocab.get_vector(self.c.lex.orth)

View File

@ -11,7 +11,7 @@ def create_copy_from_base_model(
) -> Callable[[Language], Language]: ) -> Callable[[Language], Language]:
def copy_from_base_model(nlp): def copy_from_base_model(nlp):
if tokenizer: if tokenizer:
logger.info(f"Copying tokenizer from: {tokenizer}") logger.info("Copying tokenizer from: %s", tokenizer)
base_nlp = load_model(tokenizer) base_nlp = load_model(tokenizer)
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]: if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"])) nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
@ -23,7 +23,7 @@ def create_copy_from_base_model(
) )
) )
if vocab: if vocab:
logger.info(f"Copying vocab from: {vocab}") logger.info("Copying vocab from: %s", vocab)
# only reload if the vocab is from a different model # only reload if the vocab is from a different model
if tokenizer != vocab: if tokenizer != vocab:
base_nlp = load_model(vocab) base_nlp = load_model(vocab)

View File

@ -29,7 +29,7 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]: ) -> Callable[["Language"], Iterable[Example]]:
if path is None: if path is None:
raise ValueError(Errors.E913) raise ValueError(Errors.E913)
util.logger.debug(f"Loading corpus from path: {path}") util.logger.debug("Loading corpus from path: %s", path)
return Corpus( return Corpus(
path, path,
gold_preproc=gold_preproc, gold_preproc=gold_preproc,

View File

@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
frozen_components = T["frozen_components"] frozen_components = T["frozen_components"]
# Sourced components that require resume_training # Sourced components that require resume_training
resume_components = [p for p in sourced if p not in frozen_components] resume_components = [p for p in sourced if p not in frozen_components]
logger.info(f"Pipeline: {nlp.pipe_names}") logger.info("Pipeline: %s", nlp.pipe_names)
if resume_components: if resume_components:
with nlp.select_pipes(enable=resume_components): with nlp.select_pipes(enable=resume_components):
logger.info(f"Resuming training for: {resume_components}") logger.info("Resuming training for: %s", resume_components)
nlp.resume_training(sgd=optimizer) nlp.resume_training(sgd=optimizer)
# Make sure that listeners are defined before initializing further # Make sure that listeners are defined before initializing further
nlp._link_components() nlp._link_components()
@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
if T["max_epochs"] == -1: if T["max_epochs"] == -1:
sample_size = 100 sample_size = 100
logger.debug( logger.debug(
f"Due to streamed train corpus, using only first {sample_size} " "Due to streamed train corpus, using only first %s examples for initialization. "
f"examples for initialization. If necessary, provide all labels " "If necessary, provide all labels in [initialize]. "
f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" "More info: https://spacy.io/api/cli#init_labels",
sample_size,
) )
nlp.initialize( nlp.initialize(
lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
) )
else: else:
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
logger.info(f"Initialized pipeline components: {nlp.pipe_names}") logger.info("Initialized pipeline components: %s", nlp.pipe_names)
# Detect components with listeners that are not frozen consistently # Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline: for name, proc in nlp.pipeline:
for listener in getattr( for listener in getattr(
@ -109,7 +110,7 @@ def init_vocab(
) -> None: ) -> None:
if lookups: if lookups:
nlp.vocab.lookups = lookups nlp.vocab.lookups = lookups
logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
data_path = ensure_path(data) data_path = ensure_path(data)
if data_path is not None: if data_path is not None:
lex_attrs = srsly.read_jsonl(data_path) lex_attrs = srsly.read_jsonl(data_path)
@ -125,11 +126,11 @@ def init_vocab(
else: else:
oov_prob = DEFAULT_OOV_PROB oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob}) nlp.vocab.cfg.update({"oov_prob": oov_prob})
logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
logger.info("Created vocabulary") logger.info("Created vocabulary")
if vectors is not None: if vectors is not None:
load_vectors_into_model(nlp, vectors) load_vectors_into_model(nlp, vectors)
logger.info(f"Added vectors: {vectors}") logger.info("Added vectors: %s", vectors)
# warn if source model vectors are not identical # warn if source model vectors are not identical
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
@ -191,7 +192,7 @@ def init_tok2vec(
if weights_data is not None: if weights_data is not None:
layer = get_tok2vec_ref(nlp, P) layer = get_tok2vec_ref(nlp, P)
layer.from_bytes(weights_data) layer.from_bytes(weights_data)
logger.info(f"Loaded pretrained weights from {init_tok2vec}") logger.info("Loaded pretrained weights from %s", init_tok2vec)
return True return True
return False return False
@ -202,7 +203,6 @@ def convert_vectors(
*, *,
truncate: int, truncate: int,
prune: int, prune: int,
name: Optional[str] = None,
mode: str = VectorsMode.default, mode: str = VectorsMode.default,
) -> None: ) -> None:
vectors_loc = ensure_path(vectors_loc) vectors_loc = ensure_path(vectors_loc)
@ -216,13 +216,13 @@ def convert_vectors(
nlp.vocab.deduplicate_vectors() nlp.vocab.deduplicate_vectors()
else: else:
if vectors_loc: if vectors_loc:
logger.info(f"Reading vectors from {vectors_loc}") logger.info("Reading vectors from %s", vectors_loc)
vectors_data, vector_keys, floret_settings = read_vectors( vectors_data, vector_keys, floret_settings = read_vectors(
vectors_loc, vectors_loc,
truncate, truncate,
mode=mode, mode=mode,
) )
logger.info(f"Loaded vectors from {vectors_loc}") logger.info("Loaded vectors from %s", vectors_loc)
else: else:
vectors_data, vector_keys = (None, None) vectors_data, vector_keys = (None, None)
if vector_keys is not None and mode != VectorsMode.floret: if vector_keys is not None and mode != VectorsMode.floret:
@ -241,12 +241,6 @@ def convert_vectors(
strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
) )
nlp.vocab.deduplicate_vectors() nlp.vocab.deduplicate_vectors()
if name is None:
# TODO: Is this correct? Does this matter?
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
else:
nlp.vocab.vectors.name = name
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
if prune >= 1 and mode != VectorsMode.floret: if prune >= 1 and mode != VectorsMode.floret:
nlp.vocab.prune_vectors(prune) nlp.vocab.prune_vectors(prune)

View File

@ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
if subdir.exists(): if subdir.exists():
try: try:
shutil.rmtree(str(subdir)) shutil.rmtree(str(subdir))
logger.debug(f"Removed existing output directory: {subdir}") logger.debug("Removed existing output directory: %s", subdir)
except Exception as e: except Exception as e:
raise IOError(Errors.E901.format(path=path)) from e raise IOError(Errors.E901.format(path=path)) from e

View File

@ -33,6 +33,7 @@ import inspect
import pkgutil import pkgutil
import logging import logging
import socket import socket
import stat
try: try:
import cupy.random import cupy.random
@ -55,7 +56,7 @@ if TYPE_CHECKING:
# fmt: off # fmt: off
OOV_RANK = numpy.iinfo(numpy.uint64).max OOV_RANK = numpy.iinfo(numpy.uint64).max
DEFAULT_OOV_PROB = -20 DEFAULT_OOV_PROB = -20
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
# Default order of sections in the config file. Not all sections needs to exist, # Default order of sections in the config file. Not all sections needs to exist,
# and additional sections are added at the end, in alphabetical order. # and additional sections are added at the end, in alphabetical order.
@ -139,8 +140,17 @@ class registry(thinc.registry):
return func return func
@classmethod @classmethod
def find(cls, registry_name: str, func_name: str) -> Callable: def find(
"""Get info about a registered function from the registry.""" cls, registry_name: str, func_name: str
) -> Dict[str, Optional[Union[str, int]]]:
"""Find information about a registered function, including the
module and path to the file it's defined in, the line number and the
docstring, if available.
registry_name (str): Name of the catalogue registry.
func_name (str): Name of the registered function.
RETURNS (Dict[str, Optional[Union[str, int]]]): The function info.
"""
# We're overwriting this classmethod so we're able to provide more # We're overwriting this classmethod so we're able to provide more
# specific error messages and implement a fallback to spacy-legacy. # specific error messages and implement a fallback to spacy-legacy.
if not hasattr(cls, registry_name): if not hasattr(cls, registry_name):
@ -1030,8 +1040,15 @@ def make_tempdir() -> Generator[Path, None, None]:
""" """
d = Path(tempfile.mkdtemp()) d = Path(tempfile.mkdtemp())
yield d yield d
# On Windows, git clones use read-only files, which cause permission errors
# when being deleted. This forcibly fixes permissions.
def force_remove(rmfunc, path, ex):
os.chmod(path, stat.S_IWRITE)
rmfunc(path)
try: try:
shutil.rmtree(str(d)) shutil.rmtree(str(d), onerror=force_remove)
except PermissionError as e: except PermissionError as e:
warnings.warn(Warnings.W091.format(dir=d, msg=e)) warnings.warn(Warnings.W091.format(dir=d, msg=e))

View File

@ -52,7 +52,6 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors DOCS: https://spacy.io/api/vectors
""" """
cdef public object strings cdef public object strings
cdef public object name
cdef readonly object mode cdef readonly object mode
cdef public object data cdef public object data
cdef public object key2row cdef public object key2row
@ -64,14 +63,13 @@ cdef class Vectors:
cdef readonly unicode bow cdef readonly unicode bow
cdef readonly unicode eow cdef readonly unicode eow
def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
"""Create a new vector store. """Create a new vector store.
strings (StringStore): The string store. strings (StringStore): The string store.
shape (tuple): Size of the table, as (# entries, # columns) shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray or cupy.ndarray): The vector data. data (numpy.ndarray or cupy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data. keys (iterable): A sequence of keys, aligned with the data.
name (str): A name to identify the vectors table.
mode (str): Vectors mode: "default" or "floret" (default: "default"). mode (str): Vectors mode: "default" or "floret" (default: "default").
minn (int): The floret char ngram minn (default: 0). minn (int): The floret char ngram minn (default: 0).
maxn (int): The floret char ngram maxn (default: 0). maxn (int): The floret char ngram maxn (default: 0).
@ -85,7 +83,6 @@ cdef class Vectors:
self.strings = strings self.strings = strings
if self.strings is None: if self.strings is None:
self.strings = StringStore() self.strings = StringStore()
self.name = name
if mode not in Mode.values(): if mode not in Mode.values():
raise ValueError( raise ValueError(
Errors.E202.format( Errors.E202.format(

View File

@ -11,7 +11,8 @@ from .vectors import Vectors
from pathlib import Path from pathlib import Path
def create_vocab( def create_vocab(
lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ... lang: Optional[str],
defaults: Any,
) -> Vocab: ... ) -> Vocab: ...
class Vocab: class Vocab:
@ -28,7 +29,6 @@ class Vocab:
strings: Optional[Union[List[str], StringStore]] = ..., strings: Optional[Union[List[str], StringStore]] = ...,
lookups: Optional[Lookups] = ..., lookups: Optional[Lookups] = ...,
oov_prob: float = ..., oov_prob: float = ...,
vectors_name: Optional[str] = ...,
writing_system: Dict[str, Any] = ..., writing_system: Dict[str, Any] = ...,
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ..., get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
) -> None: ... ) -> None: ...

View File

@ -23,7 +23,7 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
def create_vocab(lang, defaults, vectors_name=None): def create_vocab(lang, defaults):
# If the spacy-lookups-data package is installed, we pre-populate the lookups # If the spacy-lookups-data package is installed, we pre-populate the lookups
# with lexeme data, if available # with lexeme data, if available
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
@ -39,7 +39,6 @@ def create_vocab(lang, defaults, vectors_name=None):
lex_attr_getters=lex_attrs, lex_attr_getters=lex_attrs,
writing_system=defaults.writing_system, writing_system=defaults.writing_system,
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
vectors_name=vectors_name,
) )
@ -51,8 +50,8 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab DOCS: https://spacy.io/api/vocab
""" """
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
oov_prob=-20., vectors_name=None, writing_system={}, oov_prob=-20., writing_system={}, get_noun_chunks=None,
get_noun_chunks=None, **deprecated_kwargs): **deprecated_kwargs):
"""Create the vocabulary. """Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -61,7 +60,6 @@ cdef class Vocab:
vice versa. vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries. lookups (Lookups): Container for large lookup tables and dictionaries.
oov_prob (float): Default OOV probability. oov_prob (float): Default OOV probability.
vectors_name (str): Optional name to identify the vectors table.
get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]): get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]):
A function that yields base noun phrases used for Doc.noun_chunks. A function that yields base noun phrases used for Doc.noun_chunks.
""" """
@ -78,7 +76,7 @@ cdef class Vocab:
_ = self[string] _ = self[string]
self.lex_attr_getters = lex_attr_getters self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings) self.morphology = Morphology(self.strings)
self.vectors = Vectors(strings=self.strings, name=vectors_name) self.vectors = Vectors(strings=self.strings)
self.lookups = lookups self.lookups = lookups
self.writing_system = writing_system self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks self.get_noun_chunks = get_noun_chunks
@ -308,7 +306,7 @@ cdef class Vocab:
for key, row in self.vectors.key2row.items() for key, row in self.vectors.key2row.items()
} }
# replace vectors with deduplicated version # replace vectors with deduplicated version
self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name) self.vectors = Vectors(strings=self.strings, data=data)
for key, row in key2row.items(): for key, row in key2row.items():
self.vectors.add(key, row=row) self.vectors.add(key, row=row)
@ -358,7 +356,7 @@ cdef class Vocab:
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name) self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row])
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
syn_keys = ops.to_numpy(syn_keys) syn_keys = ops.to_numpy(syn_keys)
remap = {} remap = {}

View File

@ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.EmptyKB.v1 {id="EmptyKB"} ### spacy.EmptyKB.v1 {id="EmptyKB.v1"}
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance. This is the default when a new entity linker component is created. instance.
| Name | Description | | Name | Description |
| ---------------------- | ----------------------------------------------------------------------------------- | | ---------------------- | ----------------------------------------------------------------------------------- |
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
### spacy.EmptyKB.v2 {id="EmptyKB"}
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
instance. This is the default when a new entity linker component is created. It
returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
### spacy.KBFromFile.v1 {id="KBFromFile"} ### spacy.KBFromFile.v1 {id="KBFromFile"}
A function that reads an existing `KnowledgeBase` from file. A function that reads an existing `KnowledgeBase` from file.
@ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
`CandidateGenerator` uses the text of a mention to find its potential aliases in `CandidateGenerator` uses the text of a mention to find its potential aliases in
the `KnowledgeBase`. Note that this function is case-dependent. the `KnowledgeBase`. Note that this function is case-dependent.
### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}
A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
[`Span`](/api/span) objects denoting named entities, and returns a list of
plausible [`Candidate`](/api/kb/#candidate) objects per specified
[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
mention to find its potential aliases in the `KnowledgeBase`. Note that this
function is case-dependent.
## Coreference {id="coref-architectures",tag="experimental"} ## Coreference {id="coref-architectures",tag="experimental"}
A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to

View File

@ -201,7 +201,7 @@ This functionality was previously available as part of the command `init-model`.
</Infobox> </Infobox>
```bash ```bash
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose] $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose]
``` ```
| Name | Description | | Name | Description |
@ -212,7 +212,6 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ | | `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. | | **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
@ -1492,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir]
### project pull {id="project-pull",tag="command"} ### project pull {id="project-pull",tag="command"}
Download all files or directories listed as `outputs` for commands, unless they Download all files or directories listed as `outputs` for commands, unless they
are not already present locally. When searching for files in the remote, `pull` are already present locally. When searching for files in the remote, `pull`
won't just look at the output path, but will also consider the **command won't just look at the output path, but will also consider the **command
string** and the **hashes of the dependencies**. For instance, let's say you've string** and the **hashes of the dependencies**. For instance, let's say you've
previously pushed a checkpoint to the remote, but now you've changed some previously pushed a checkpoint to the remote, but now you've changed some

View File

@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
come directly from come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
| Symbol | Description | | Symbol | Description |
| --------- | -------------------------------------------------------------------------------------------------------------------- | | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
| `A < B` | `A` is the immediate dependent of `B`. | | `A < B` | `A` is the immediate dependent of `B`. |
| `A > B` | `A` is the immediate head of `B`. | | `A > B` | `A` is the immediate head of `B`. |
| `A << B` | `A` is the dependent in a chain to `B` following dep &rarr; head paths. | | `A << B` | `A` is the dependent in a chain to `B` following dep &rarr; head paths. |
| `A >> B` | `A` is the head in a chain to `B` following head &rarr; dep paths. | | `A >> B` | `A` is the head in a chain to `B` following head &rarr; dep paths. |
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | | `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | | `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | | `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | | `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | | `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | | `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | | `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | | `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | | `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | | `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}

View File

@ -214,6 +214,7 @@ alignment mode `"strict".
| `start` | The index of the first character of the span. ~~int~~ | | `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the last character after the span. ~~int~~ | | `end` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| _keyword-only_ | |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
@ -653,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).
## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"} ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
Iterate over the base noun phrases in the document. Yields base noun-phrase Returns a tuple of the base noun phrases in the doc, if the document has been
`Span` objects, if the document has been syntactically parsed. A base noun syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be does not permit other NPs to be nested within it so no NP-level coordination,
nested within it so no NP-level coordination, no prepositional phrases, and no no prepositional phrases, and no relative clauses.
relative clauses.
To customize the noun chunk iterator in a loaded pipeline, modify To customize the noun chunk iterator in a loaded pipeline, modify
[`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@ -674,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
> assert chunks[1].text == "another phrase" > assert chunks[1].text == "another phrase"
> ``` > ```
| Name | Description | | Name | Description |
| ---------- | ------------------------------------- | | ----------- | -------------------------------------------- |
| **YIELDS** | Noun chunks in the document. ~~Span~~ | | **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
## Doc.sents {id="sents",tag="property",model="sentences"} ## Doc.sents {id="sents",tag="property",model="sentences"}
Iterate over the sentences in the document. Sentence spans have no label. Returns a tuple of the sentences in the document. Sentence spans have no label.
This property is only available when This property is only available when
[sentence boundaries](/usage/linguistic-features#sbd) have been set on the [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@ -696,9 +696,9 @@ will raise an error otherwise.
> assert [s.root.text for s in sents] == ["is", "'s"] > assert [s.root.text for s in sents] == ["is", "'s"]
> ``` > ```
| Name | Description | | Name | Description |
| ---------- | ----------------------------------- | | ----------- | ------------------------------------------ |
| **YIELDS** | Sentences in the document. ~~Span~~ | | **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
## Doc.has_vector {id="has_vector",tag="property",model="vectors"} ## Doc.has_vector {id="has_vector",tag="property",model="vectors"}

View File

@ -53,20 +53,22 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("entity_linker", config=config) > nlp.add_pipe("entity_linker", config=config)
> ``` > ```
| Setting | Description | | Setting | Description |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | | `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | | `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ | | `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | | `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | | `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
```python ```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py %%GITHUB_SPACY/spacy/pipeline/entity_linker.py

View File

@ -188,12 +188,12 @@ the character indices don't map to a valid span.
| Name | Description | | Name | Description |
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ | | `start_idx` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the last character after the span. ~~int~~ | | `end_idx` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| _keyword-only_ | |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `id` | Unused. ~~Union[int, str]~~ |
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ | | `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
@ -275,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of
> assert ents[0].text == "Mr. Best" > assert ents[0].text == "Mr. Best"
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ----------------------------------------------------------------- | | ----------- | ------------------------------------------------------------ |
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |
## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"} ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}
Iterate over the base noun phrases in the span. Yields base noun-phrase `Span` Returns a tuple of the base noun phrases in the span if the document has been
objects, if the document has been syntactically parsed. A base noun phrase, or syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
"NP chunk", is a noun phrase that does not permit other NPs to be nested within does not permit other NPs to be nested within it so no NP-level coordination,
it so no NP-level coordination, no prepositional phrases, and no relative no prepositional phrases, and no relative clauses.
clauses.
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
has not been implemented for the given language, a `NotImplementedError` is has not been implemented for the given language, a `NotImplementedError` is
@ -301,9 +300,9 @@ raised.
> assert chunks[0].text == "another phrase" > assert chunks[0].text == "another phrase"
> ``` > ```
| Name | Description | | Name | Description |
| ---------- | --------------------------------- | | ----------- | ---------------------------------------- |
| **YIELDS** | Noun chunks in the span. ~~Span~~ | | **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
## Span.as_doc {id="as_doc",tag="method"} ## Span.as_doc {id="as_doc",tag="method"}
@ -525,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]
## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"} ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}
Returns a generator over the sentences the span belongs to. This property is Returns a tuple of the sentences the span belongs to. This property is only
only available when [sentence boundaries](/usage/linguistic-features#sbd) have available when [sentence boundaries](/usage/linguistic-features#sbd) have been
been set on the document by the `parser`, `senter`, `sentencizer` or some custom set on the document by the `parser`, `senter`, `sentencizer` or some custom
function. It will raise an error otherwise. function. It will raise an error otherwise.
If the span happens to cross sentence boundaries, all sentences the span If the span happens to cross sentence boundaries, all sentences the span
@ -541,9 +540,9 @@ overlaps with will be returned.
> assert len(span.sents) == 2 > assert len(span.sents) == 2
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | -------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------- |
| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ | | **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |
## Attributes {id="attributes"} ## Attributes {id="attributes"}

View File

@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both
| `doc` | The document to process. ~~Doc~~ | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ |
## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"}
Performs an update of the student pipe's model using the student's distillation
examples and sets the annotations of the teacher's distillation examples using
the teacher pipe.
Unlike other trainable pipes, the student pipe doesn't directly learn its
representations from the teacher. However, since downstream pipes that do
perform distillation expect the tok2vec annotations to be present on the
correct distillation examples, we need to ensure that they are set beforehand.
The distillation is performed on ~~Example~~ objects. The `Example.reference`
and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
same orthography. Even though the reference does not need to have gold
annotations, the teacher can add its own annotations when necessary.
This feature is experimental.
> #### Example
>
> ```python
> teacher_pipe = teacher.add_pipe("tok2vec")
> student_pipe = student.add_pipe("tok2vec")
> optimizer = nlp.resume_training()
> losses = student_pipe.distill(teacher_pipe, examples, sgd=optimizer)
> ```
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~ |
| `examples` | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| `drop` | Dropout rate. ~~float~~ |
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Tok2Vec.pipe {id="pipe",tag="method"} ## Tok2Vec.pipe {id="pipe",tag="method"}
Apply the pipe to a stream of documents. This usually happens under the hood Apply the pipe to a stream of documents. This usually happens under the hood

View File

@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="dep", options=options) > displacy.serve(doc, style="dep", options=options)
> ``` > ```
| Name | Description | | Name | Description |
| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | | `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | | `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | | `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | | `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | | `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | | `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | | `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | | `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | | `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | | `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | | `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | | `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
#### Named Entity Visualizer options {id="displacy_options-ent"} #### Named Entity Visualizer options {id="displacy_options-ent"}

View File

@ -52,7 +52,6 @@ modified later.
| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | | `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
| `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ | | `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ |
| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | | `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
| `name` | A name to identify the vectors table. ~~str~~ |
| `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ | | `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ |
| `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ | | `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ |
| `maxn` <Tag variant="new">3.2</Tag> | The floret char ngram maxn (default: `0`). ~~int~~ | | `maxn` <Tag variant="new">3.2</Tag> | The floret char ngram maxn (default: `0`). ~~int~~ |

View File

@ -27,7 +27,6 @@ Create the vocabulary.
| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | | `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
| `vectors_name` | A name to identify the vectors table. ~~str~~ |
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ | | `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |

View File

@ -22,17 +22,20 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
<Infobox title="Important note" variant="warning"> <Infobox title="Important note" variant="warning">
To make them compact and fast, spaCy's small [pipeline packages](/models) (all To make them compact and fast, spaCy's small [pipeline packages](/models) (all
packages that end in `sm`) **don't ship with word vectors**, and only include packages that end in `sm`) **don't ship with word vectors**. In order to use
context-sensitive **tensors**. This means you can still use the `similarity()` `similarity()`, you need to download a larger pipeline package that includes
methods to compare documents, spans and tokens but the result won't be as vectors:
good, and individual tokens won't have any vectors assigned. So in order to use
_real_ word vectors, you need to download a larger pipeline package:
```diff ```diff
- python -m spacy download en_core_web_sm - python -m spacy download en_core_web_sm
+ python -m spacy download en_core_web_lg + python -m spacy download en_core_web_md
``` ```
In spaCy v3 and earlier, small pipeline packages supported `similarity()` by
backing off to context-sensitive tensors from the `tok2vec` component. These
tensors do not work well for this purpose and this backoff has been removed in
spaCy v4.
</Infobox> </Infobox>
Pipeline packages that come with built-in word vectors make them available as Pipeline packages that come with built-in word vectors make them available as

View File

@ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
come directly from come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
| Symbol | Description | | Symbol | Description |
| --------- | -------------------------------------------------------------------------------------------------------------------- | | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
| `A < B` | `A` is the immediate dependent of `B`. | | `A < B` | `A` is the immediate dependent of `B`. |
| `A > B` | `A` is the immediate head of `B`. | | `A > B` | `A` is the immediate head of `B`. |
| `A << B` | `A` is the dependent in a chain to `B` following dep &rarr; head paths. | | `A << B` | `A` is the dependent in a chain to `B` following dep &rarr; head paths. |
| `A >> B` | `A` is the head in a chain to `B` following head &rarr; dep paths. | | `A >> B` | `A` is the head in a chain to `B` following head &rarr; dep paths. |
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | | `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | | `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | | `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | | `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | | `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | | `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
### Designing dependency matcher patterns {id="dependencymatcher-patterns"} ### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
@ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline")
The saved pipeline now includes the `"entity_ruler"` in its The saved pipeline now includes the `"entity_ruler"` in its
[`config.cfg`](/api/data-formats#config) and the pipeline directory contains a [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a
file `entityruler.jsonl` with the patterns. When you load the pipeline back in, file `patterns.jsonl` with the patterns. When you load the pipeline back in, all
all pipeline components will be restored and deserialized including the entity pipeline components will be restored and deserialized including the entity
ruler. This lets you ship powerful pipeline packages with binary weights _and_ ruler. This lets you ship powerful pipeline packages with binary weights _and_
rules included! rules included!

View File

@ -58,12 +58,12 @@ arcs.
</Infobox> </Infobox>
| Argument | Description | | Argument | Description |
| --------- | ----------------------------------------------------------------------------------------- | | --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | | `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | | `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | | `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
For a list of all available options, see the For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options). [`displacy` API documentation](/api/top-level#displacy_options).