mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
Merge branch 'master' into sync/master-into-v4
# Conflicts: # requirements.txt # spacy/pipeline/entity_linker.py # spacy/util.py # website/docs/api/entitylinker.mdx
This commit is contained in:
commit
1ea31552be
2
.github/workflows/autoblack.yml
vendored
2
.github/workflows/autoblack.yml
vendored
|
@ -16,7 +16,7 @@ jobs:
|
||||||
with:
|
with:
|
||||||
ref: ${{ github.head_ref }}
|
ref: ${{ github.head_ref }}
|
||||||
- uses: actions/setup-python@v4
|
- uses: actions/setup-python@v4
|
||||||
- run: pip install black
|
- run: pip install black -c requirements.txt
|
||||||
- name: Auto-format code if needed
|
- name: Auto-format code if needed
|
||||||
run: black spacy
|
run: black spacy
|
||||||
# We can't run black --check here because that returns a non-zero exit
|
# We can't run black --check here because that returns a non-zero exit
|
||||||
|
|
|
@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
|
||||||
Python modules. If you've built spaCy from source, you'll already have both
|
Python modules. If you've built spaCy from source, you'll already have both
|
||||||
tools installed.
|
tools installed.
|
||||||
|
|
||||||
|
As a general rule of thumb, we use f-strings for any formatting of strings.
|
||||||
|
One exception is calls to Python's `logging` functionality.
|
||||||
|
To avoid unnecessary string conversions in these cases, we use string formatting
|
||||||
|
templates with `%s` and `%d` etc.
|
||||||
|
|
||||||
**⚠️ Note that formatting and linting is currently only possible for Python
|
**⚠️ Note that formatting and linting is currently only possible for Python
|
||||||
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
|
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
|
||||||
|
|
||||||
|
|
|
@ -41,7 +41,7 @@ jobs:
|
||||||
inputs:
|
inputs:
|
||||||
versionSpec: "3.8"
|
versionSpec: "3.8"
|
||||||
- script: |
|
- script: |
|
||||||
pip install black==22.3.0
|
pip install black -c requirements.txt
|
||||||
python -m black spacy --check
|
python -m black spacy --check
|
||||||
displayName: "black"
|
displayName: "black"
|
||||||
- script: |
|
- script: |
|
||||||
|
|
|
@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
|
||||||
mock>=2.0.0,<3.0.0
|
mock>=2.0.0,<3.0.0
|
||||||
flake8>=3.8.0,<6.0.0
|
flake8>=3.8.0,<6.0.0
|
||||||
hypothesis>=3.27.0,<7.0.0
|
hypothesis>=3.27.0,<7.0.0
|
||||||
mypy>=0.990,<0.1000; platform_machine != "aarch64"
|
mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
|
||||||
|
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||||
types-mock>=0.1.1
|
types-mock>=0.1.1
|
||||||
types-setuptools>=57.0.0
|
types-setuptools>=57.0.0
|
||||||
types-requests
|
types-requests
|
||||||
types-setuptools>=57.0.0
|
types-setuptools>=57.0.0
|
||||||
black>=22.0,<23.0
|
black==22.3.0
|
||||||
|
|
|
@ -90,9 +90,9 @@ def parse_config_overrides(
|
||||||
cli_overrides = _parse_overrides(args, is_cli=True)
|
cli_overrides = _parse_overrides(args, is_cli=True)
|
||||||
if cli_overrides:
|
if cli_overrides:
|
||||||
keys = [k for k in cli_overrides if k not in env_overrides]
|
keys = [k for k in cli_overrides if k not in env_overrides]
|
||||||
logger.debug(f"Config overrides from CLI: {keys}")
|
logger.debug("Config overrides from CLI: %s", keys)
|
||||||
if env_overrides:
|
if env_overrides:
|
||||||
logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
|
logger.debug("Config overrides from env variables: %s", list(env_overrides))
|
||||||
return {**cli_overrides, **env_overrides}
|
return {**cli_overrides, **env_overrides}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -252,7 +252,7 @@ def get_third_party_dependencies(
|
||||||
raise regerr from None
|
raise regerr from None
|
||||||
module_name = func_info.get("module") # type: ignore[attr-defined]
|
module_name = func_info.get("module") # type: ignore[attr-defined]
|
||||||
if module_name: # the code is part of a module, not a --code file
|
if module_name: # the code is part of a module, not a --code file
|
||||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
|
modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
|
||||||
dependencies = []
|
dependencies = []
|
||||||
for module_name in modules:
|
for module_name in modules:
|
||||||
if module_name in distributions:
|
if module_name in distributions:
|
||||||
|
|
|
@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
||||||
# in the list.
|
# in the list.
|
||||||
while commands:
|
while commands:
|
||||||
for i, cmd in enumerate(list(commands)):
|
for i, cmd in enumerate(list(commands)):
|
||||||
logger.debug(f"CMD: {cmd['name']}.")
|
logger.debug("CMD: %s.", cmd["name"])
|
||||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||||
if all(dep.exists() for dep in deps):
|
if all(dep.exists() for dep in deps):
|
||||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
||||||
for output_path in cmd.get("outputs", []):
|
for output_path in cmd.get("outputs", []):
|
||||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"URL: {url} for {output_path} with command hash {cmd_hash}"
|
"URL: %s for %s with command hash %s",
|
||||||
|
url,
|
||||||
|
output_path,
|
||||||
|
cmd_hash,
|
||||||
)
|
)
|
||||||
yield url, output_path
|
yield url, output_path
|
||||||
|
|
||||||
|
@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
||||||
commands.pop(i)
|
commands.pop(i)
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
|
logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
|
||||||
else:
|
else:
|
||||||
# If we didn't break the for loop, break the while loop.
|
# If we didn't break the for loop, break the while loop.
|
||||||
break
|
break
|
||||||
|
|
|
@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
|
||||||
remote = config["remotes"][remote]
|
remote = config["remotes"][remote]
|
||||||
storage = RemoteStorage(project_dir, remote)
|
storage = RemoteStorage(project_dir, remote)
|
||||||
for cmd in config.get("commands", []):
|
for cmd in config.get("commands", []):
|
||||||
logger.debug(f"CMD: cmd['name']")
|
logger.debug("CMD: %s", cmd["name"])
|
||||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||||
if any(not dep.exists() for dep in deps):
|
if any(not dep.exists() for dep in deps):
|
||||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
|
logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
|
||||||
continue
|
continue
|
||||||
cmd_hash = get_command_hash(
|
cmd_hash = get_command_hash(
|
||||||
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
||||||
)
|
)
|
||||||
logger.debug(f"CMD_HASH: {cmd_hash}")
|
logger.debug("CMD_HASH: %s", cmd_hash)
|
||||||
for output_path in cmd.get("outputs", []):
|
for output_path in cmd.get("outputs", []):
|
||||||
output_loc = project_dir / output_path
|
output_loc = project_dir / output_path
|
||||||
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
||||||
|
@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
|
||||||
content_hash=get_content_hash(output_loc),
|
content_hash=get_content_hash(output_loc),
|
||||||
)
|
)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
|
"URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
|
||||||
)
|
)
|
||||||
yield output_path, url
|
yield output_path, url
|
||||||
|
|
||||||
|
|
|
@ -437,8 +437,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
||||||
"exceed 1, but found {sum}.")
|
"exceed 1, but found {sum}.")
|
||||||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||||
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
|
E139 = ("Knowledge base for component '{name}' is empty.")
|
||||||
"`kb.add_entity` and `kb.add_alias` to add entries.")
|
|
||||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||||
"should be of equal length.")
|
"should be of equal length.")
|
||||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||||
|
@ -951,7 +950,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
||||||
"with `displacy.serve(doc, port=port)`")
|
"with `displacy.serve(doc, port=port)`")
|
||||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||||
"or use `auto_switch_port=True` to pick an available port automatically.")
|
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||||
|
|
||||||
# v4 error strings
|
# v4 error strings
|
||||||
E4000 = ("Expected a Doc as input, but got: '{type}'")
|
E4000 = ("Expected a Doc as input, but got: '{type}'")
|
||||||
|
|
|
@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
self._alias_index = PreshMap(nr_aliases + 1)
|
self._alias_index = PreshMap(nr_aliases + 1)
|
||||||
self._aliases_table = alias_vec(nr_aliases + 1)
|
self._aliases_table = alias_vec(nr_aliases + 1)
|
||||||
|
|
||||||
|
def is_empty(self):
|
||||||
|
return len(self) == 0
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.get_size_entities()
|
return self.get_size_entities()
|
||||||
|
|
||||||
|
|
|
@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language, BaseDefaults
|
from ...language import Language, BaseDefaults
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
# Punctuation stolen from Danish
|
|
||||||
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
|
||||||
|
|
||||||
|
|
||||||
class SwedishDefaults(BaseDefaults):
|
class SwedishDefaults(BaseDefaults):
|
||||||
|
|
33
spacy/lang/sv/punctuation.py
Normal file
33
spacy/lang/sv/punctuation.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||||
|
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||||
|
from ..punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
|
||||||
|
_infixes = (
|
||||||
|
LIST_ELLIPSES
|
||||||
|
+ LIST_ICONS
|
||||||
|
+ [
|
||||||
|
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||||
|
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
|
||||||
|
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||||
|
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
|
||||||
|
r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
_suffixes = [
|
||||||
|
suffix
|
||||||
|
for suffix in TOKENIZER_SUFFIXES
|
||||||
|
if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
|
||||||
|
]
|
||||||
|
_suffixes += [r"(?<=[^sSxXzZ])\'"]
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_INFIXES = _infixes
|
||||||
|
TOKENIZER_SUFFIXES = _suffixes
|
|
@ -106,7 +106,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
||||||
|
|
||||||
@registry.misc("spacy.LookupsDataLoader.v1")
|
@registry.misc("spacy.LookupsDataLoader.v1")
|
||||||
def load_lookups_data(lang, tables):
|
def load_lookups_data(lang, tables):
|
||||||
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
|
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
|
||||||
lookups = load_lookups(lang=lang, tables=tables)
|
lookups = load_lookups(lang=lang, tables=tables)
|
||||||
return lookups
|
return lookups
|
||||||
|
|
||||||
|
@ -2072,7 +2072,7 @@ class Language:
|
||||||
pipe = self.get_pipe(pipe_name)
|
pipe = self.get_pipe(pipe_name)
|
||||||
pipe_cfg = self._pipe_configs[pipe_name]
|
pipe_cfg = self._pipe_configs[pipe_name]
|
||||||
if listeners:
|
if listeners:
|
||||||
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
|
util.logger.debug("Replacing listeners of component '%s'", pipe_name)
|
||||||
if len(list(listeners)) != len(pipe_listeners):
|
if len(list(listeners)) != len(pipe_listeners):
|
||||||
# The number of listeners defined in the component model doesn't
|
# The number of listeners defined in the component model doesn't
|
||||||
# match the listeners to replace, so we won't be able to update
|
# match the listeners to replace, so we won't be able to update
|
||||||
|
|
|
@ -82,8 +82,12 @@ cdef class DependencyMatcher:
|
||||||
"$-": self._imm_left_sib,
|
"$-": self._imm_left_sib,
|
||||||
"$++": self._right_sib,
|
"$++": self._right_sib,
|
||||||
"$--": self._left_sib,
|
"$--": self._left_sib,
|
||||||
|
">+": self._imm_right_child,
|
||||||
|
">-": self._imm_left_child,
|
||||||
">++": self._right_child,
|
">++": self._right_child,
|
||||||
">--": self._left_child,
|
">--": self._left_child,
|
||||||
|
"<+": self._imm_right_parent,
|
||||||
|
"<-": self._imm_left_parent,
|
||||||
"<++": self._right_parent,
|
"<++": self._right_parent,
|
||||||
"<--": self._left_parent,
|
"<--": self._left_parent,
|
||||||
}
|
}
|
||||||
|
@ -427,12 +431,34 @@ cdef class DependencyMatcher:
|
||||||
def _left_sib(self, doc, node):
|
def _left_sib(self, doc, node):
|
||||||
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
||||||
|
|
||||||
|
def _imm_right_child(self, doc, node):
|
||||||
|
for child in doc[node].children:
|
||||||
|
if child.i == node + 1:
|
||||||
|
return [doc[child.i]]
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _imm_left_child(self, doc, node):
|
||||||
|
for child in doc[node].children:
|
||||||
|
if child.i == node - 1:
|
||||||
|
return [doc[child.i]]
|
||||||
|
return []
|
||||||
|
|
||||||
def _right_child(self, doc, node):
|
def _right_child(self, doc, node):
|
||||||
return [doc[child.i] for child in doc[node].children if child.i > node]
|
return [doc[child.i] for child in doc[node].children if child.i > node]
|
||||||
|
|
||||||
def _left_child(self, doc, node):
|
def _left_child(self, doc, node):
|
||||||
return [doc[child.i] for child in doc[node].children if child.i < node]
|
return [doc[child.i] for child in doc[node].children if child.i < node]
|
||||||
|
|
||||||
|
def _imm_right_parent(self, doc, node):
|
||||||
|
if doc[node].head.i == node + 1:
|
||||||
|
return [doc[node].head]
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _imm_left_parent(self, doc, node):
|
||||||
|
if doc[node].head.i == node - 1:
|
||||||
|
return [doc[node].head]
|
||||||
|
return []
|
||||||
|
|
||||||
def _right_parent(self, doc, node):
|
def _right_parent(self, doc, node):
|
||||||
if doc[node].head.i > node:
|
if doc[node].head.i > node:
|
||||||
return [doc[node].head]
|
return [doc[node].head]
|
||||||
|
|
|
@ -829,6 +829,11 @@ def _get_attr_values(spec, string_store):
|
||||||
return attr_values
|
return attr_values
|
||||||
|
|
||||||
|
|
||||||
|
def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
|
||||||
|
# tuple order affects performance
|
||||||
|
return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
|
||||||
|
|
||||||
|
|
||||||
# These predicate helper classes are used to match the REGEX, IN, >= etc
|
# These predicate helper classes are used to match the REGEX, IN, >= etc
|
||||||
# extensions to the matcher introduced in #3173.
|
# extensions to the matcher introduced in #3173.
|
||||||
|
|
||||||
|
@ -848,7 +853,7 @@ class _FuzzyPredicate:
|
||||||
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
||||||
self.fuzzy = int(fuzz) if fuzz else -1
|
self.fuzzy = int(fuzz) if fuzz else -1
|
||||||
self.fuzzy_compare = fuzzy_compare
|
self.fuzzy_compare = fuzzy_compare
|
||||||
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
|
||||||
|
|
||||||
def __call__(self, Token token):
|
def __call__(self, Token token):
|
||||||
if self.is_extension:
|
if self.is_extension:
|
||||||
|
@ -870,7 +875,7 @@ class _RegexPredicate:
|
||||||
self.value = re.compile(value)
|
self.value = re.compile(value)
|
||||||
self.predicate = predicate
|
self.predicate = predicate
|
||||||
self.is_extension = is_extension
|
self.is_extension = is_extension
|
||||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
self.key = _predicate_cache_key(self.attr, self.predicate, value)
|
||||||
if self.predicate not in self.operators:
|
if self.predicate not in self.operators:
|
||||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||||
|
|
||||||
|
@ -906,7 +911,7 @@ class _SetPredicate:
|
||||||
self.value = set(get_string_id(v) for v in value)
|
self.value = set(get_string_id(v) for v in value)
|
||||||
self.predicate = predicate
|
self.predicate = predicate
|
||||||
self.is_extension = is_extension
|
self.is_extension = is_extension
|
||||||
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
|
||||||
if self.predicate not in self.operators:
|
if self.predicate not in self.operators:
|
||||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||||
|
|
||||||
|
@ -978,7 +983,7 @@ class _ComparisonPredicate:
|
||||||
self.value = value
|
self.value = value
|
||||||
self.predicate = predicate
|
self.predicate = predicate
|
||||||
self.is_extension = is_extension
|
self.is_extension = is_extension
|
||||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
self.key = _predicate_cache_key(self.attr, self.predicate, value)
|
||||||
if self.predicate not in self.operators:
|
if self.predicate not in self.operators:
|
||||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||||
|
|
||||||
|
@ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
for type_, cls in predicate_types.items():
|
for type_, cls in predicate_types.items():
|
||||||
if type_ in value:
|
if type_ in value:
|
||||||
key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
|
key = _predicate_cache_key(attr, type_, value[type_])
|
||||||
if key in seen_predicates:
|
if key in seen_predicates:
|
||||||
output.append(seen_predicates[key])
|
output.append(seen_predicates[key])
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -89,6 +89,14 @@ def load_kb(
|
||||||
return kb_from_file
|
return kb_from_file
|
||||||
|
|
||||||
|
|
||||||
|
@registry.misc("spacy.EmptyKB.v2")
|
||||||
|
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
|
||||||
|
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
|
||||||
|
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
|
||||||
|
|
||||||
|
return empty_kb_factory
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.EmptyKB.v1")
|
@registry.misc("spacy.EmptyKB.v1")
|
||||||
def empty_kb(
|
def empty_kb(
|
||||||
entity_vector_length: int,
|
entity_vector_length: int,
|
||||||
|
|
|
@ -58,6 +58,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
||||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||||
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||||
"overwrite": False,
|
"overwrite": False,
|
||||||
|
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
|
||||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||||
"use_gold_ents": True,
|
"use_gold_ents": True,
|
||||||
"candidates_batch_size": 1,
|
"candidates_batch_size": 1,
|
||||||
|
@ -84,6 +85,7 @@ def make_entity_linker(
|
||||||
get_candidates_batch: Callable[
|
get_candidates_batch: Callable[
|
||||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||||
],
|
],
|
||||||
|
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||||
overwrite: bool,
|
overwrite: bool,
|
||||||
scorer: Optional[Callable],
|
scorer: Optional[Callable],
|
||||||
use_gold_ents: bool,
|
use_gold_ents: bool,
|
||||||
|
@ -106,6 +108,7 @@ def make_entity_linker(
|
||||||
get_candidates_batch (
|
get_candidates_batch (
|
||||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||||
|
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||||
scorer (Optional[Callable]): The scoring method.
|
scorer (Optional[Callable]): The scoring method.
|
||||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||||
component must provide entity annotations.
|
component must provide entity annotations.
|
||||||
|
@ -147,6 +150,7 @@ def make_entity_linker(
|
||||||
entity_vector_length=entity_vector_length,
|
entity_vector_length=entity_vector_length,
|
||||||
get_candidates=get_candidates,
|
get_candidates=get_candidates,
|
||||||
get_candidates_batch=get_candidates_batch,
|
get_candidates_batch=get_candidates_batch,
|
||||||
|
generate_empty_kb=generate_empty_kb,
|
||||||
overwrite=overwrite,
|
overwrite=overwrite,
|
||||||
scorer=scorer,
|
scorer=scorer,
|
||||||
use_gold_ents=use_gold_ents,
|
use_gold_ents=use_gold_ents,
|
||||||
|
@ -188,6 +192,7 @@ class EntityLinker(TrainablePipe):
|
||||||
get_candidates_batch: Callable[
|
get_candidates_batch: Callable[
|
||||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||||
],
|
],
|
||||||
|
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||||
overwrite: bool = False,
|
overwrite: bool = False,
|
||||||
scorer: Optional[Callable] = entity_linker_score,
|
scorer: Optional[Callable] = entity_linker_score,
|
||||||
use_gold_ents: bool,
|
use_gold_ents: bool,
|
||||||
|
@ -212,6 +217,7 @@ class EntityLinker(TrainablePipe):
|
||||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
||||||
Iterable[Candidate]]
|
Iterable[Candidate]]
|
||||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||||
|
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||||
overwrite (bool): Whether to overwrite existing non-empty annotations.
|
overwrite (bool): Whether to overwrite existing non-empty annotations.
|
||||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||||
|
@ -219,6 +225,7 @@ class EntityLinker(TrainablePipe):
|
||||||
candidates_batch_size (int): Size of batches for entity candidate generation.
|
candidates_batch_size (int): Size of batches for entity candidate generation.
|
||||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
|
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
|
||||||
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
|
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||||
|
save_activations (bool): save model activations in Doc when annotating.
|
||||||
DOCS: https://spacy.io/api/entitylinker#init
|
DOCS: https://spacy.io/api/entitylinker#init
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -235,6 +242,7 @@ class EntityLinker(TrainablePipe):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.name = name
|
self.name = name
|
||||||
self.labels_discard = list(labels_discard)
|
self.labels_discard = list(labels_discard)
|
||||||
|
# how many neighbour sentences to take into account
|
||||||
self.n_sents = n_sents
|
self.n_sents = n_sents
|
||||||
self.incl_prior = incl_prior
|
self.incl_prior = incl_prior
|
||||||
self.incl_context = incl_context
|
self.incl_context = incl_context
|
||||||
|
@ -242,9 +250,7 @@ class EntityLinker(TrainablePipe):
|
||||||
self.get_candidates_batch = get_candidates_batch
|
self.get_candidates_batch = get_candidates_batch
|
||||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||||
self.distance = CosineDistance(normalize=False)
|
self.distance = CosineDistance(normalize=False)
|
||||||
# how many neighbour sentences to take into account
|
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
|
||||||
# create an empty KB by default
|
|
||||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
|
||||||
self.scorer = scorer
|
self.scorer = scorer
|
||||||
self.use_gold_ents = use_gold_ents
|
self.use_gold_ents = use_gold_ents
|
||||||
self.candidates_batch_size = candidates_batch_size
|
self.candidates_batch_size = candidates_batch_size
|
||||||
|
@ -266,7 +272,7 @@ class EntityLinker(TrainablePipe):
|
||||||
# Raise an error if the knowledge base is not initialized.
|
# Raise an error if the knowledge base is not initialized.
|
||||||
if self.kb is None:
|
if self.kb is None:
|
||||||
raise ValueError(Errors.E1018.format(name=self.name))
|
raise ValueError(Errors.E1018.format(name=self.name))
|
||||||
if len(self.kb) == 0:
|
if hasattr(self.kb, "is_empty") and self.kb.is_empty():
|
||||||
raise ValueError(Errors.E139.format(name=self.name))
|
raise ValueError(Errors.E139.format(name=self.name))
|
||||||
|
|
||||||
def initialize(
|
def initialize(
|
||||||
|
|
|
@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
|
||||||
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
|
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
|
||||||
tokens = sv_tokenizer(text)
|
tokens = sv_tokenizer(text)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.issue(12311)
|
||||||
|
@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
|
||||||
|
def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
|
||||||
|
tokens = sv_tokenizer(text)
|
||||||
|
assert len(tokens) == 1
|
||||||
|
|
|
@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
|
||||||
("the", "brown", "$--", 0),
|
("the", "brown", "$--", 0),
|
||||||
("brown", "the", "$--", 1),
|
("brown", "the", "$--", 1),
|
||||||
("brown", "brown", "$--", 0),
|
("brown", "brown", "$--", 0),
|
||||||
|
("over", "jumped", "<+", 0),
|
||||||
|
("quick", "fox", "<+", 0),
|
||||||
|
("the", "quick", "<+", 0),
|
||||||
|
("brown", "fox", "<+", 1),
|
||||||
("quick", "fox", "<++", 1),
|
("quick", "fox", "<++", 1),
|
||||||
("quick", "over", "<++", 0),
|
("quick", "over", "<++", 0),
|
||||||
("over", "jumped", "<++", 0),
|
("over", "jumped", "<++", 0),
|
||||||
("the", "fox", "<++", 2),
|
("the", "fox", "<++", 2),
|
||||||
|
("brown", "fox", "<-", 0),
|
||||||
|
("fox", "over", "<-", 0),
|
||||||
|
("the", "over", "<-", 0),
|
||||||
|
("over", "jumped", "<-", 1),
|
||||||
("brown", "fox", "<--", 0),
|
("brown", "fox", "<--", 0),
|
||||||
("fox", "jumped", "<--", 0),
|
("fox", "jumped", "<--", 0),
|
||||||
("fox", "over", "<--", 1),
|
("fox", "over", "<--", 1),
|
||||||
|
("fox", "brown", ">+", 0),
|
||||||
|
("over", "fox", ">+", 0),
|
||||||
|
("over", "the", ">+", 0),
|
||||||
|
("jumped", "over", ">+", 1),
|
||||||
("jumped", "over", ">++", 1),
|
("jumped", "over", ">++", 1),
|
||||||
("fox", "lazy", ">++", 0),
|
("fox", "lazy", ">++", 0),
|
||||||
("over", "the", ">++", 0),
|
("over", "the", ">++", 0),
|
||||||
|
("jumped", "over", ">-", 0),
|
||||||
|
("fox", "quick", ">-", 0),
|
||||||
|
("brown", "quick", ">-", 0),
|
||||||
|
("fox", "brown", ">-", 1),
|
||||||
("brown", "fox", ">--", 0),
|
("brown", "fox", ">--", 0),
|
||||||
("fox", "brown", ">--", 1),
|
("fox", "brown", ">--", 1),
|
||||||
("jumped", "fox", ">--", 1),
|
("jumped", "fox", ">--", 1),
|
||||||
|
|
|
@ -353,6 +353,9 @@ def test_kb_default(nlp):
|
||||||
"""Test that the default (empty) KB is loaded upon construction"""
|
"""Test that the default (empty) KB is loaded upon construction"""
|
||||||
entity_linker = nlp.add_pipe("entity_linker", config={})
|
entity_linker = nlp.add_pipe("entity_linker", config={})
|
||||||
assert len(entity_linker.kb) == 0
|
assert len(entity_linker.kb) == 0
|
||||||
|
with pytest.raises(ValueError, match="E139"):
|
||||||
|
# this raises an error because the KB is empty
|
||||||
|
entity_linker.validate_kb()
|
||||||
assert entity_linker.kb.get_size_entities() == 0
|
assert entity_linker.kb.get_size_entities() == 0
|
||||||
assert entity_linker.kb.get_size_aliases() == 0
|
assert entity_linker.kb.get_size_aliases() == 0
|
||||||
# 64 is the default value from pipeline.entity_linker
|
# 64 is the default value from pipeline.entity_linker
|
||||||
|
|
|
@ -1,7 +1,10 @@
|
||||||
from typing import Callable
|
from pathlib import Path
|
||||||
|
from typing import Callable, Iterable, Any, Dict
|
||||||
|
|
||||||
from spacy import util
|
import srsly
|
||||||
from spacy.util import ensure_path, registry, load_model_from_config
|
|
||||||
|
from spacy import util, Errors
|
||||||
|
from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
|
||||||
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
||||||
from spacy.vocab import Vocab
|
from spacy.vocab import Vocab
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
|
@ -91,7 +94,10 @@ def test_serialize_subclassed_kb():
|
||||||
|
|
||||||
[components.entity_linker]
|
[components.entity_linker]
|
||||||
factory = "entity_linker"
|
factory = "entity_linker"
|
||||||
|
|
||||||
|
[components.entity_linker.generate_empty_kb]
|
||||||
|
@misc = "kb_test.CustomEmptyKB.v1"
|
||||||
|
|
||||||
[initialize]
|
[initialize]
|
||||||
|
|
||||||
[initialize.components]
|
[initialize.components]
|
||||||
|
@ -99,7 +105,7 @@ def test_serialize_subclassed_kb():
|
||||||
[initialize.components.entity_linker]
|
[initialize.components.entity_linker]
|
||||||
|
|
||||||
[initialize.components.entity_linker.kb_loader]
|
[initialize.components.entity_linker.kb_loader]
|
||||||
@misc = "spacy.CustomKB.v1"
|
@misc = "kb_test.CustomKB.v1"
|
||||||
entity_vector_length = 342
|
entity_vector_length = 342
|
||||||
custom_field = 666
|
custom_field = 666
|
||||||
"""
|
"""
|
||||||
|
@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
|
||||||
super().__init__(vocab, entity_vector_length)
|
super().__init__(vocab, entity_vector_length)
|
||||||
self.custom_field = custom_field
|
self.custom_field = custom_field
|
||||||
|
|
||||||
@registry.misc("spacy.CustomKB.v1")
|
def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
|
||||||
|
"""We override InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
|
||||||
|
path = ensure_path(path)
|
||||||
|
if not path.exists():
|
||||||
|
path.mkdir(parents=True)
|
||||||
|
if not path.is_dir():
|
||||||
|
raise ValueError(Errors.E928.format(loc=path))
|
||||||
|
|
||||||
|
def serialize_custom_fields(file_path: Path) -> None:
|
||||||
|
srsly.write_json(file_path, {"custom_field": self.custom_field})
|
||||||
|
|
||||||
|
serialize = {
|
||||||
|
"contents": lambda p: self.write_contents(p),
|
||||||
|
"strings.json": lambda p: self.vocab.strings.to_disk(p),
|
||||||
|
"custom_fields": lambda p: serialize_custom_fields(p),
|
||||||
|
}
|
||||||
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
|
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
|
||||||
|
"""We override InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
|
||||||
|
path = ensure_path(path)
|
||||||
|
if not path.exists():
|
||||||
|
raise ValueError(Errors.E929.format(loc=path))
|
||||||
|
if not path.is_dir():
|
||||||
|
raise ValueError(Errors.E928.format(loc=path))
|
||||||
|
|
||||||
|
def deserialize_custom_fields(file_path: Path) -> None:
|
||||||
|
self.custom_field = srsly.read_json(file_path)["custom_field"]
|
||||||
|
|
||||||
|
deserialize: Dict[str, Callable[[Any], Any]] = {
|
||||||
|
"contents": lambda p: self.read_contents(p),
|
||||||
|
"strings.json": lambda p: self.vocab.strings.from_disk(p),
|
||||||
|
"custom_fields": lambda p: deserialize_custom_fields(p),
|
||||||
|
}
|
||||||
|
util.from_disk(path, deserialize, exclude)
|
||||||
|
|
||||||
|
@registry.misc("kb_test.CustomEmptyKB.v1")
|
||||||
|
def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
|
||||||
|
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
|
||||||
|
return SubInMemoryLookupKB(
|
||||||
|
vocab=vocab,
|
||||||
|
entity_vector_length=entity_vector_length,
|
||||||
|
custom_field=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
return empty_kb_factory
|
||||||
|
|
||||||
|
@registry.misc("kb_test.CustomKB.v1")
|
||||||
def custom_kb(
|
def custom_kb(
|
||||||
entity_vector_length: int, custom_field: int
|
entity_vector_length: int, custom_field: int
|
||||||
) -> Callable[[Vocab], InMemoryLookupKB]:
|
) -> Callable[[Vocab], SubInMemoryLookupKB]:
|
||||||
def custom_kb_factory(vocab):
|
def custom_kb_factory(vocab):
|
||||||
kb = SubInMemoryLookupKB(
|
kb = SubInMemoryLookupKB(
|
||||||
vocab=vocab,
|
vocab=vocab,
|
||||||
|
@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
|
||||||
nlp2 = util.load_model_from_path(tmp_dir)
|
nlp2 = util.load_model_from_path(tmp_dir)
|
||||||
entity_linker2 = nlp2.get_pipe("entity_linker")
|
entity_linker2 = nlp2.get_pipe("entity_linker")
|
||||||
# After IO, the KB is the standard one
|
# After IO, the KB is still the custom subclass and keeps its custom field
|
||||||
assert type(entity_linker2.kb) == InMemoryLookupKB
|
assert type(entity_linker2.kb) == SubInMemoryLookupKB
|
||||||
assert entity_linker2.kb.entity_vector_length == 342
|
assert entity_linker2.kb.entity_vector_length == 342
|
||||||
assert not hasattr(entity_linker2.kb, "custom_field")
|
assert entity_linker2.kb.custom_field == 666
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import pytest
|
||||||
|
import srsly
|
||||||
from typer.testing import CliRunner
|
from typer.testing import CliRunner
|
||||||
from spacy.tokens import DocBin, Doc
|
from spacy.tokens import DocBin, Doc
|
||||||
|
|
||||||
|
@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
|
||||||
# Instead of checking specific wording of the output, which may change,
|
# Instead of checking specific wording of the output, which may change,
|
||||||
# we'll check that this section of the debug output is present.
|
# we'll check that this section of the debug output is present.
|
||||||
assert "= Trainable Lemmatizer =" in result_debug_data.stdout
|
assert "= Trainable Lemmatizer =" in result_debug_data.stdout
|
||||||
|
|
||||||
|
|
||||||
|
# project tests
|
||||||
|
|
||||||
|
SAMPLE_PROJECT = {
|
||||||
|
"title": "Sample project",
|
||||||
|
"description": "This is a project for testing",
|
||||||
|
"assets": [
|
||||||
|
{
|
||||||
|
"dest": "assets/spacy-readme.md",
|
||||||
|
"url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
|
||||||
|
"checksum": "411b2c89ccf34288fae8ed126bf652f7",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dest": "assets/citation.cff",
|
||||||
|
"url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
|
||||||
|
"checksum": "c996bfd80202d480eb2e592369714e5e",
|
||||||
|
"extra": True,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"commands": [
|
||||||
|
{
|
||||||
|
"name": "ok",
|
||||||
|
"help": "print ok",
|
||||||
|
"script": ["python -c \"print('okokok')\""],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "create",
|
||||||
|
"help": "make a file",
|
||||||
|
"script": ["touch abc.txt"],
|
||||||
|
"outputs": ["abc.txt"],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "clean",
|
||||||
|
"help": "remove test file",
|
||||||
|
"script": ["rm abc.txt"],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def project_dir():
|
||||||
|
with make_tempdir() as pdir:
|
||||||
|
(pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
|
||||||
|
yield pdir
|
||||||
|
|
||||||
|
|
||||||
|
def test_project_document(project_dir):
|
||||||
|
readme_path = project_dir / "README.md"
|
||||||
|
assert not readme_path.exists(), "README already exists"
|
||||||
|
result = CliRunner().invoke(
|
||||||
|
app, ["project", "document", str(project_dir), "-o", str(readme_path)]
|
||||||
|
)
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert readme_path.is_file()
|
||||||
|
text = readme_path.read_text("utf-8")
|
||||||
|
assert SAMPLE_PROJECT["description"] in text
|
||||||
|
|
||||||
|
|
||||||
|
def test_project_assets(project_dir):
|
||||||
|
asset_dir = project_dir / "assets"
|
||||||
|
assert not asset_dir.exists(), "Assets dir is already present"
|
||||||
|
result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
|
||||||
|
# check that extras work
|
||||||
|
result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
|
||||||
|
|
||||||
|
|
||||||
|
def test_project_run(project_dir):
|
||||||
|
# make sure dry run works
|
||||||
|
test_file = project_dir / "abc.txt"
|
||||||
|
result = CliRunner().invoke(
|
||||||
|
app, ["project", "run", "--dry", "create", str(project_dir)]
|
||||||
|
)
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert not test_file.is_file()
|
||||||
|
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert test_file.is_file()
|
||||||
|
result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert "okokok" in result.stdout
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"options",
|
||||||
|
[
|
||||||
|
"",
|
||||||
|
# "--sparse",
|
||||||
|
"--branch v3",
|
||||||
|
"--repo https://github.com/explosion/projects --branch v3",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_project_clone(options):
|
||||||
|
with make_tempdir() as workspace:
|
||||||
|
out = workspace / "project"
|
||||||
|
target = "benchmarks/ner_conll03"
|
||||||
|
if not options:
|
||||||
|
options = []
|
||||||
|
else:
|
||||||
|
options = options.split()
|
||||||
|
result = CliRunner().invoke(
|
||||||
|
app, ["project", "clone", target, *options, str(out)]
|
||||||
|
)
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert (out / "README.md").is_file()
|
||||||
|
|
||||||
|
|
||||||
|
def test_project_push_pull(project_dir):
|
||||||
|
proj = dict(SAMPLE_PROJECT)
|
||||||
|
remote = "xyz"
|
||||||
|
|
||||||
|
with make_tempdir() as remote_dir:
|
||||||
|
proj["remotes"] = {remote: str(remote_dir)}
|
||||||
|
proj_text = srsly.yaml_dumps(proj)
|
||||||
|
(project_dir / "project.yml").write_text(proj_text)
|
||||||
|
|
||||||
|
test_file = project_dir / "abc.txt"
|
||||||
|
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert test_file.is_file()
|
||||||
|
result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert not test_file.exists()
|
||||||
|
result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert test_file.is_file()
|
||||||
|
|
|
@ -98,7 +98,7 @@ def assert_sents_error(doc):
|
||||||
|
|
||||||
def warn_error(proc_name, proc, docs, e):
|
def warn_error(proc_name, proc, docs, e):
|
||||||
logger = logging.getLogger("spacy")
|
logger = logging.getLogger("spacy")
|
||||||
logger.warning(f"Trouble with component {proc_name}.")
|
logger.warning("Trouble with component %s.", proc_name)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
|
@ -11,7 +11,7 @@ def create_copy_from_base_model(
|
||||||
) -> Callable[[Language], Language]:
|
) -> Callable[[Language], Language]:
|
||||||
def copy_from_base_model(nlp):
|
def copy_from_base_model(nlp):
|
||||||
if tokenizer:
|
if tokenizer:
|
||||||
logger.info(f"Copying tokenizer from: {tokenizer}")
|
logger.info("Copying tokenizer from: %s", tokenizer)
|
||||||
base_nlp = load_model(tokenizer)
|
base_nlp = load_model(tokenizer)
|
||||||
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
|
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
|
||||||
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
|
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
|
||||||
|
@ -23,7 +23,7 @@ def create_copy_from_base_model(
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if vocab:
|
if vocab:
|
||||||
logger.info(f"Copying vocab from: {vocab}")
|
logger.info("Copying vocab from: %s", vocab)
|
||||||
# only reload if the vocab is from a different model
|
# only reload if the vocab is from a different model
|
||||||
if tokenizer != vocab:
|
if tokenizer != vocab:
|
||||||
base_nlp = load_model(vocab)
|
base_nlp = load_model(vocab)
|
||||||
|
|
|
@ -29,7 +29,7 @@ def create_docbin_reader(
|
||||||
) -> Callable[["Language"], Iterable[Example]]:
|
) -> Callable[["Language"], Iterable[Example]]:
|
||||||
if path is None:
|
if path is None:
|
||||||
raise ValueError(Errors.E913)
|
raise ValueError(Errors.E913)
|
||||||
util.logger.debug(f"Loading corpus from path: {path}")
|
util.logger.debug("Loading corpus from path: %s", path)
|
||||||
return Corpus(
|
return Corpus(
|
||||||
path,
|
path,
|
||||||
gold_preproc=gold_preproc,
|
gold_preproc=gold_preproc,
|
||||||
|
|
|
@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
||||||
frozen_components = T["frozen_components"]
|
frozen_components = T["frozen_components"]
|
||||||
# Sourced components that require resume_training
|
# Sourced components that require resume_training
|
||||||
resume_components = [p for p in sourced if p not in frozen_components]
|
resume_components = [p for p in sourced if p not in frozen_components]
|
||||||
logger.info(f"Pipeline: {nlp.pipe_names}")
|
logger.info("Pipeline: %s", nlp.pipe_names)
|
||||||
if resume_components:
|
if resume_components:
|
||||||
with nlp.select_pipes(enable=resume_components):
|
with nlp.select_pipes(enable=resume_components):
|
||||||
logger.info(f"Resuming training for: {resume_components}")
|
logger.info("Resuming training for: %s", resume_components)
|
||||||
nlp.resume_training(sgd=optimizer)
|
nlp.resume_training(sgd=optimizer)
|
||||||
# Make sure that listeners are defined before initializing further
|
# Make sure that listeners are defined before initializing further
|
||||||
nlp._link_components()
|
nlp._link_components()
|
||||||
|
@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
||||||
if T["max_epochs"] == -1:
|
if T["max_epochs"] == -1:
|
||||||
sample_size = 100
|
sample_size = 100
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Due to streamed train corpus, using only first {sample_size} "
|
"Due to streamed train corpus, using only first %s examples for initialization. "
|
||||||
f"examples for initialization. If necessary, provide all labels "
|
"If necessary, provide all labels in [initialize]. "
|
||||||
f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
|
"More info: https://spacy.io/api/cli#init_labels",
|
||||||
|
sample_size,
|
||||||
)
|
)
|
||||||
nlp.initialize(
|
nlp.initialize(
|
||||||
lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
|
lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
|
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
|
||||||
logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
|
logger.info("Initialized pipeline components: %s", nlp.pipe_names)
|
||||||
# Detect components with listeners that are not frozen consistently
|
# Detect components with listeners that are not frozen consistently
|
||||||
for name, proc in nlp.pipeline:
|
for name, proc in nlp.pipeline:
|
||||||
for listener in getattr(
|
for listener in getattr(
|
||||||
|
@ -109,7 +110,7 @@ def init_vocab(
|
||||||
) -> None:
|
) -> None:
|
||||||
if lookups:
|
if lookups:
|
||||||
nlp.vocab.lookups = lookups
|
nlp.vocab.lookups = lookups
|
||||||
logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
|
logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
|
||||||
data_path = ensure_path(data)
|
data_path = ensure_path(data)
|
||||||
if data_path is not None:
|
if data_path is not None:
|
||||||
lex_attrs = srsly.read_jsonl(data_path)
|
lex_attrs = srsly.read_jsonl(data_path)
|
||||||
|
@ -125,11 +126,11 @@ def init_vocab(
|
||||||
else:
|
else:
|
||||||
oov_prob = DEFAULT_OOV_PROB
|
oov_prob = DEFAULT_OOV_PROB
|
||||||
nlp.vocab.cfg.update({"oov_prob": oov_prob})
|
nlp.vocab.cfg.update({"oov_prob": oov_prob})
|
||||||
logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
|
logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
|
||||||
logger.info("Created vocabulary")
|
logger.info("Created vocabulary")
|
||||||
if vectors is not None:
|
if vectors is not None:
|
||||||
load_vectors_into_model(nlp, vectors)
|
load_vectors_into_model(nlp, vectors)
|
||||||
logger.info(f"Added vectors: {vectors}")
|
logger.info("Added vectors: %s", vectors)
|
||||||
# warn if source model vectors are not identical
|
# warn if source model vectors are not identical
|
||||||
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
|
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
|
||||||
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
||||||
|
@ -191,7 +192,7 @@ def init_tok2vec(
|
||||||
if weights_data is not None:
|
if weights_data is not None:
|
||||||
layer = get_tok2vec_ref(nlp, P)
|
layer = get_tok2vec_ref(nlp, P)
|
||||||
layer.from_bytes(weights_data)
|
layer.from_bytes(weights_data)
|
||||||
logger.info(f"Loaded pretrained weights from {init_tok2vec}")
|
logger.info("Loaded pretrained weights from %s", init_tok2vec)
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -215,13 +216,13 @@ def convert_vectors(
|
||||||
nlp.vocab.deduplicate_vectors()
|
nlp.vocab.deduplicate_vectors()
|
||||||
else:
|
else:
|
||||||
if vectors_loc:
|
if vectors_loc:
|
||||||
logger.info(f"Reading vectors from {vectors_loc}")
|
logger.info("Reading vectors from %s", vectors_loc)
|
||||||
vectors_data, vector_keys, floret_settings = read_vectors(
|
vectors_data, vector_keys, floret_settings = read_vectors(
|
||||||
vectors_loc,
|
vectors_loc,
|
||||||
truncate,
|
truncate,
|
||||||
mode=mode,
|
mode=mode,
|
||||||
)
|
)
|
||||||
logger.info(f"Loaded vectors from {vectors_loc}")
|
logger.info("Loaded vectors from %s", vectors_loc)
|
||||||
else:
|
else:
|
||||||
vectors_data, vector_keys = (None, None)
|
vectors_data, vector_keys = (None, None)
|
||||||
if vector_keys is not None and mode != VectorsMode.floret:
|
if vector_keys is not None and mode != VectorsMode.floret:
|
||||||
|
|
|
@ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
|
||||||
if subdir.exists():
|
if subdir.exists():
|
||||||
try:
|
try:
|
||||||
shutil.rmtree(str(subdir))
|
shutil.rmtree(str(subdir))
|
||||||
logger.debug(f"Removed existing output directory: {subdir}")
|
logger.debug("Removed existing output directory: %s", subdir)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise IOError(Errors.E901.format(path=path)) from e
|
raise IOError(Errors.E901.format(path=path)) from e
|
||||||
|
|
|
@ -33,6 +33,7 @@ import inspect
|
||||||
import pkgutil
|
import pkgutil
|
||||||
import logging
|
import logging
|
||||||
import socket
|
import socket
|
||||||
|
import stat
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import cupy.random
|
import cupy.random
|
||||||
|
@ -55,7 +56,7 @@ if TYPE_CHECKING:
|
||||||
# fmt: off
|
# fmt: off
|
||||||
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
||||||
DEFAULT_OOV_PROB = -20
|
DEFAULT_OOV_PROB = -20
|
||||||
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
|
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
|
||||||
|
|
||||||
# Default order of sections in the config file. Not all sections need to exist,
|
# Default order of sections in the config file. Not all sections need to exist,
|
||||||
# and additional sections are added at the end, in alphabetical order.
|
# and additional sections are added at the end, in alphabetical order.
|
||||||
|
@ -139,8 +140,17 @@ class registry(thinc.registry):
|
||||||
return func
|
return func
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def find(cls, registry_name: str, func_name: str) -> Callable:
|
def find(
|
||||||
"""Get info about a registered function from the registry."""
|
cls, registry_name: str, func_name: str
|
||||||
|
) -> Dict[str, Optional[Union[str, int]]]:
|
||||||
|
"""Find information about a registered function, including the
|
||||||
|
module and path to the file it's defined in, the line number and the
|
||||||
|
docstring, if available.
|
||||||
|
|
||||||
|
registry_name (str): Name of the catalogue registry.
|
||||||
|
func_name (str): Name of the registered function.
|
||||||
|
RETURNS (Dict[str, Optional[Union[str, int]]]): The function info.
|
||||||
|
"""
|
||||||
# We're overriding this classmethod so we're able to provide more
|
# We're overriding this classmethod so we're able to provide more
|
||||||
# specific error messages and implement a fallback to spacy-legacy.
|
# specific error messages and implement a fallback to spacy-legacy.
|
||||||
if not hasattr(cls, registry_name):
|
if not hasattr(cls, registry_name):
|
||||||
|
@ -1028,11 +1038,19 @@ def make_tempdir() -> Generator[Path, None, None]:
|
||||||
|
|
||||||
YIELDS (Path): The path of the temp directory.
|
YIELDS (Path): The path of the temp directory.
|
||||||
"""
|
"""
|
||||||
|
d = Path(tempfile.mkdtemp())
|
||||||
|
yield d
|
||||||
|
|
||||||
|
# On Windows, git clones use read-only files, which cause permission errors
|
||||||
|
# when being deleted. This forcibly fixes permissions.
|
||||||
|
def force_remove(rmfunc, path, ex):
|
||||||
|
os.chmod(path, stat.S_IWRITE)
|
||||||
|
rmfunc(path)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with tempfile.TemporaryDirectory() as td:
|
shutil.rmtree(str(d), onerror=force_remove)
|
||||||
yield Path(td)
|
|
||||||
except PermissionError as e:
|
except PermissionError as e:
|
||||||
warnings.warn(Warnings.W091.format(dir=td, msg=e))
|
warnings.warn(Warnings.W091.format(dir=d, msg=e))
|
||||||
|
|
||||||
|
|
||||||
def is_cwd(path: Union[Path, str]) -> bool:
|
def is_cwd(path: Union[Path, str]) -> bool:
|
||||||
|
|
|
@ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
|
||||||
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
|
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||||
|
|
||||||
### spacy.EmptyKB.v1 {id="EmptyKB"}
|
### spacy.EmptyKB.v1 {id="EmptyKB.v1"}
|
||||||
|
|
||||||
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
|
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
|
||||||
instance. This is the default when a new entity linker component is created.
|
instance.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------------- | ----------------------------------------------------------------------------------- |
|
| ---------------------- | ----------------------------------------------------------------------------------- |
|
||||||
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
|
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
|
||||||
|
|
||||||
|
### spacy.EmptyKB.v2 {id="EmptyKB"}
|
||||||
|
|
||||||
|
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
|
||||||
|
instance. This is the default when a new entity linker component is created. It
|
||||||
|
returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
|
||||||
|
|
||||||
### spacy.KBFromFile.v1 {id="KBFromFile"}
|
### spacy.KBFromFile.v1 {id="KBFromFile"}
|
||||||
|
|
||||||
A function that reads an existing `KnowledgeBase` from file.
|
A function that reads an existing `KnowledgeBase` from file.
|
||||||
|
@ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
|
||||||
`CandidateGenerator` uses the text of a mention to find its potential aliases in
|
`CandidateGenerator` uses the text of a mention to find its potential aliases in
|
||||||
the `KnowledgeBase`. Note that this function is case-dependent.
|
the `KnowledgeBase`. Note that this function is case-dependent.
|
||||||
|
|
||||||
|
### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}
|
||||||
|
|
||||||
|
A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
|
||||||
|
[`Span`](/api/span) objects denoting named entities, and returns a list of
|
||||||
|
plausible [`Candidate`](/api/kb/#candidate) objects per specified
|
||||||
|
[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
|
||||||
|
mention to find its potential aliases in the `KnowledgeBase`. Note that this
|
||||||
|
function is case-dependent.
|
||||||
|
|
||||||
## Coreference {id="coref-architectures",tag="experimental"}
|
## Coreference {id="coref-architectures",tag="experimental"}
|
||||||
|
|
||||||
A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to
|
A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to
|
||||||
|
|
|
@ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir]
|
||||||
### project pull {id="project-pull",tag="command"}
|
### project pull {id="project-pull",tag="command"}
|
||||||
|
|
||||||
Download all files or directories listed as `outputs` for commands, unless they
|
Download all files or directories listed as `outputs` for commands, unless they
|
||||||
are not already present locally. When searching for files in the remote, `pull`
|
are already present locally. When searching for files in the remote, `pull`
|
||||||
won't just look at the output path, but will also consider the **command
|
won't just look at the output path, but will also consider the **command
|
||||||
string** and the **hashes of the dependencies**. For instance, let's say you've
|
string** and the **hashes of the dependencies**. For instance, let's say you've
|
||||||
previously pushed a checkpoint to the remote, but now you've changed some
|
previously pushed a checkpoint to the remote, but now you've changed some
|
||||||
|
|
|
@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
|
||||||
come directly from
|
come directly from
|
||||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||||
|
|
||||||
| Symbol | Description |
|
| Symbol | Description |
|
||||||
| --------- | -------------------------------------------------------------------------------------------------------------------- |
|
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||||
| `A > B` | `A` is the immediate head of `B`. |
|
| `A > B` | `A` is the immediate head of `B`. |
|
||||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||||
|
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||||
|
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||||
|
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||||
|
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||||
|
|
||||||
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
|
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -53,20 +53,22 @@ architectures and their arguments and hyperparameters.
|
||||||
> nlp.add_pipe("entity_linker", config=config)
|
> nlp.add_pipe("entity_linker", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Setting | Description |
|
| Setting | Description |
|
||||||
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
|
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
|
||||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
|
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
|
||||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
||||||
|
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||||
|
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||||
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
|
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
|
||||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
||||||
|
|
|
@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used.
|
||||||
> displacy.serve(doc, style="dep", options=options)
|
> displacy.serve(doc, style="dep", options=options)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
|
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
|
||||||
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
|
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
|
||||||
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
|
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
|
||||||
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
|
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
|
||||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||||
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
|
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
|
||||||
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
|
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
|
||||||
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
||||||
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
|
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
|
||||||
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
|
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
|
||||||
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
|
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
|
||||||
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
|
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
|
||||||
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
|
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
|
||||||
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
|
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
|
||||||
|
|
||||||
#### Named Entity Visualizer options {id="displacy_options-ent"}
|
#### Named Entity Visualizer options {id="displacy_options-ent"}
|
||||||
|
|
||||||
|
|
|
@ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
|
||||||
come directly from
|
come directly from
|
||||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||||
|
|
||||||
| Symbol | Description |
|
| Symbol | Description |
|
||||||
| --------- | -------------------------------------------------------------------------------------------------------------------- |
|
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||||
| `A > B` | `A` is the immediate head of `B`. |
|
| `A > B` | `A` is the immediate head of `B`. |
|
||||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||||
|
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||||
|
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||||
|
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||||
|
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||||
|
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||||
|
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||||
|
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||||
|
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||||
|
|
||||||
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
|
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
|
||||||
|
|
||||||
|
@ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline")
|
||||||
|
|
||||||
The saved pipeline now includes the `"entity_ruler"` in its
|
The saved pipeline now includes the `"entity_ruler"` in its
|
||||||
[`config.cfg`](/api/data-formats#config) and the pipeline directory contains a
|
[`config.cfg`](/api/data-formats#config) and the pipeline directory contains a
|
||||||
file `entityruler.jsonl` with the patterns. When you load the pipeline back in,
|
file `patterns.jsonl` with the patterns. When you load the pipeline back in, all
|
||||||
all pipeline components will be restored and deserialized – including the entity
|
pipeline components will be restored and deserialized – including the entity
|
||||||
ruler. This lets you ship powerful pipeline packages with binary weights _and_
|
ruler. This lets you ship powerful pipeline packages with binary weights _and_
|
||||||
rules included!
|
rules included!
|
||||||
|
|
||||||
|
|
|
@ -58,12 +58,12 @@ arcs.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
| Argument | Description |
|
| Argument | Description |
|
||||||
| --------- | ----------------------------------------------------------------------------------------- |
|
| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||||
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
|
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
|
||||||
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
|
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
|
||||||
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
||||||
|
|
||||||
For a list of all available options, see the
|
For a list of all available options, see the
|
||||||
[`displacy` API documentation](/api/top-level#displacy_options).
|
[`displacy` API documentation](/api/top-level#displacy_options).
|
||||||
|
|
Loading…
Reference in New Issue
Block a user