mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-22 18:12:00 +03:00
Merge branch 'v4' into feature/tok2vec-distill-method
This commit is contained in:
commit
35fcf2eb38
5
.github/azure-steps.yml
vendored
5
.github/azure-steps.yml
vendored
|
@ -69,6 +69,11 @@ steps:
|
|||
# displayName: 'Test skip re-download (#12188)'
|
||||
# condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
# - script: |
|
||||
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||
# displayName: 'Test download_url in info CLI'
|
||||
# condition: eq(variables['python_version'] '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
displayName: 'Test convert CLI'
|
||||
|
|
2
.github/workflows/autoblack.yml
vendored
2
.github/workflows/autoblack.yml
vendored
|
@ -16,7 +16,7 @@ jobs:
|
|||
with:
|
||||
ref: ${{ github.head_ref }}
|
||||
- uses: actions/setup-python@v4
|
||||
- run: pip install black
|
||||
- run: pip install black -c requirements.txt
|
||||
- name: Auto-format code if needed
|
||||
run: black spacy
|
||||
# We can't run black --check here because that returns a non-zero excit
|
||||
|
|
|
@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
|
|||
Python modules. If you've built spaCy from source, you'll already have both
|
||||
tools installed.
|
||||
|
||||
As a general rule of thumb, we use f-strings for any formatting of strings.
|
||||
One exception are calls to Python's `logging` functionality.
|
||||
To avoid unnecessary string conversions in these cases, we use string formatting
|
||||
templates with `%s` and `%d` etc.
|
||||
|
||||
**⚠️ Note that formatting and linting is currently only possible for Python
|
||||
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
|
||||
|
||||
|
|
|
@ -41,7 +41,7 @@ jobs:
|
|||
inputs:
|
||||
versionSpec: "3.8"
|
||||
- script: |
|
||||
pip install black==22.3.0
|
||||
pip install black -c requirements.txt
|
||||
python -m black spacy --check
|
||||
displayName: "black"
|
||||
- script: |
|
||||
|
|
|
@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
|
|||
mock>=2.0.0,<3.0.0
|
||||
flake8>=3.8.0,<6.0.0
|
||||
hypothesis>=3.27.0,<7.0.0
|
||||
mypy>=0.990,<0.1000; platform_machine != "aarch64"
|
||||
mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
|
||||
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||
types-mock>=0.1.1
|
||||
types-setuptools>=57.0.0
|
||||
types-requests
|
||||
types-setuptools>=57.0.0
|
||||
black>=22.0,<23.0
|
||||
black==22.3.0
|
||||
|
|
|
@ -19,6 +19,7 @@ import os
|
|||
from ..schemas import ProjectConfigSchema, validate
|
||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
||||
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
||||
from ..errors import RENAMED_LANGUAGE_CODES
|
||||
from .. import about
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
@ -89,9 +90,9 @@ def parse_config_overrides(
|
|||
cli_overrides = _parse_overrides(args, is_cli=True)
|
||||
if cli_overrides:
|
||||
keys = [k for k in cli_overrides if k not in env_overrides]
|
||||
logger.debug(f"Config overrides from CLI: {keys}")
|
||||
logger.debug("Config overrides from CLI: %s", keys)
|
||||
if env_overrides:
|
||||
logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
|
||||
logger.debug("Config overrides from env variables: %s", list(env_overrides))
|
||||
return {**cli_overrides, **env_overrides}
|
||||
|
||||
|
||||
|
@ -134,6 +135,16 @@ def _parse_override(value: Any) -> Any:
|
|||
return str(value)
|
||||
|
||||
|
||||
def _handle_renamed_language_codes(lang: Optional[str]) -> None:
|
||||
# Throw error for renamed language codes in v4
|
||||
if lang in RENAMED_LANGUAGE_CODES:
|
||||
msg.fail(
|
||||
title="Renamed language code",
|
||||
text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
|
||||
def load_project_config(
|
||||
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
|
||||
) -> Dict[str, Any]:
|
||||
|
|
|
@ -7,7 +7,7 @@ import re
|
|||
import sys
|
||||
import itertools
|
||||
|
||||
from ._util import app, Arg, Opt, walk_directory
|
||||
from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
|
||||
from ..training import docs_to_json
|
||||
from ..tokens import Doc, DocBin
|
||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
||||
|
@ -112,6 +112,10 @@ def convert(
|
|||
input_path = Path(input_path)
|
||||
if not msg:
|
||||
msg = Printer(no_print=silent)
|
||||
|
||||
# Throw error for renamed language codes in v4
|
||||
_handle_renamed_language_codes(lang)
|
||||
|
||||
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
||||
doc_files = []
|
||||
for input_loc in walk_directory(input_path, converter):
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
from typing import Optional, Dict, Any, Union, List
|
||||
import platform
|
||||
import pkg_resources
|
||||
import json
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, MarkdownRenderer
|
||||
import srsly
|
||||
import importlib.metadata
|
||||
|
||||
from ._util import app, Arg, Opt, string_to_list
|
||||
from .download import get_model_filename, get_latest_version
|
||||
|
@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
|
|||
dist-info available.
|
||||
"""
|
||||
try:
|
||||
dist = pkg_resources.get_distribution(model)
|
||||
data = json.loads(dist.get_metadata("direct_url.json"))
|
||||
return data["url"]
|
||||
except pkg_resources.DistributionNotFound:
|
||||
# no such package
|
||||
return None
|
||||
dist = importlib.metadata.distribution(model)
|
||||
text = dist.read_text("direct_url.json")
|
||||
if isinstance(text, str):
|
||||
data = json.loads(text)
|
||||
return data["url"]
|
||||
except Exception:
|
||||
# something else, like no file or invalid JSON
|
||||
return None
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def info_model_url(model: str) -> Dict[str, Any]:
|
||||
|
|
|
@ -12,7 +12,7 @@ from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
|
|||
from ..schemas import RecommendationSchema
|
||||
from ..util import SimpleFrozenList
|
||||
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
||||
from ._util import string_to_list, import_code
|
||||
from ._util import string_to_list, import_code, _handle_renamed_language_codes
|
||||
|
||||
|
||||
ROOT = Path(__file__).parent / "templates"
|
||||
|
@ -43,7 +43,7 @@ class InitValues:
|
|||
def init_config_cli(
|
||||
# fmt: off
|
||||
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
||||
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
|
||||
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
|
||||
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
|
||||
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
|
||||
|
@ -169,6 +169,10 @@ def init_config(
|
|||
msg = Printer(no_print=silent)
|
||||
with TEMPLATE_PATH.open("r") as f:
|
||||
template = Template(f.read())
|
||||
|
||||
# Throw error for renamed language codes in v4
|
||||
_handle_renamed_language_codes(lang)
|
||||
|
||||
# Filter out duplicates since tok2vec and transformer are added by template
|
||||
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
|
||||
defaults = RECOMMENDATIONS["__default__"]
|
||||
|
|
|
@ -9,7 +9,7 @@ from .. import util
|
|||
from ..training.initialize import init_nlp, convert_vectors
|
||||
from ..language import Language
|
||||
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code, setup_gpu
|
||||
from ._util import import_code, setup_gpu, _handle_renamed_language_codes
|
||||
|
||||
|
||||
@init_cli.command("vectors")
|
||||
|
@ -21,7 +21,6 @@ def init_vectors_cli(
|
|||
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
|
||||
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
|
||||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
||||
# fmt: on
|
||||
|
@ -31,6 +30,10 @@ def init_vectors_cli(
|
|||
a model with vectors.
|
||||
"""
|
||||
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
|
||||
|
||||
# Throw error for renamed language codes in v4
|
||||
_handle_renamed_language_codes(lang)
|
||||
|
||||
msg.info(f"Creating blank nlp object for language '{lang}'")
|
||||
nlp = util.get_lang_class(lang)()
|
||||
if jsonl_loc is not None:
|
||||
|
@ -40,7 +43,6 @@ def init_vectors_cli(
|
|||
vectors_loc,
|
||||
truncate=truncate,
|
||||
prune=prune,
|
||||
name=name,
|
||||
mode=mode,
|
||||
)
|
||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||
|
|
|
@ -252,7 +252,7 @@ def get_third_party_dependencies(
|
|||
raise regerr from None
|
||||
module_name = func_info.get("module") # type: ignore[attr-defined]
|
||||
if module_name: # the code is part of a module, not a --code file
|
||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
|
||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
|
||||
dependencies = []
|
||||
for module_name in modules:
|
||||
if module_name in distributions:
|
||||
|
|
|
@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
|||
# in the list.
|
||||
while commands:
|
||||
for i, cmd in enumerate(list(commands)):
|
||||
logger.debug(f"CMD: {cmd['name']}.")
|
||||
logger.debug("CMD: %s.", cmd["name"])
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if all(dep.exists() for dep in deps):
|
||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
||||
for output_path in cmd.get("outputs", []):
|
||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||
logger.debug(
|
||||
f"URL: {url} for {output_path} with command hash {cmd_hash}"
|
||||
"URL: %s for %s with command hash %s",
|
||||
url,
|
||||
output_path,
|
||||
cmd_hash,
|
||||
)
|
||||
yield url, output_path
|
||||
|
||||
|
@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
|||
commands.pop(i)
|
||||
break
|
||||
else:
|
||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
|
||||
logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
|
||||
else:
|
||||
# If we didn't break the for loop, break the while loop.
|
||||
break
|
||||
|
|
|
@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
|
|||
remote = config["remotes"][remote]
|
||||
storage = RemoteStorage(project_dir, remote)
|
||||
for cmd in config.get("commands", []):
|
||||
logger.debug(f"CMD: cmd['name']")
|
||||
logger.debug("CMD: %s", cmd["name"])
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if any(not dep.exists() for dep in deps):
|
||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
|
||||
logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
|
||||
continue
|
||||
cmd_hash = get_command_hash(
|
||||
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
||||
)
|
||||
logger.debug(f"CMD_HASH: {cmd_hash}")
|
||||
logger.debug("CMD_HASH: %s", cmd_hash)
|
||||
for output_path in cmd.get("outputs", []):
|
||||
output_loc = project_dir / output_path
|
||||
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
||||
|
@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
|
|||
content_hash=get_content_hash(output_loc),
|
||||
)
|
||||
logger.debug(
|
||||
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
|
||||
"URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
|
||||
)
|
||||
yield output_path, url
|
||||
|
||||
|
|
|
@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
|
|||
import os.path
|
||||
from pathlib import Path
|
||||
|
||||
import pkg_resources
|
||||
from wasabi import msg
|
||||
from wasabi.util import locale_escape
|
||||
import sys
|
||||
|
@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
|
|||
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
|
||||
exist.
|
||||
"""
|
||||
import pkg_resources
|
||||
|
||||
failed_pkgs_msgs: List[str] = []
|
||||
conflicting_pkgs_msgs: List[str] = []
|
||||
|
|
|
@ -437,8 +437,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
||||
"exceed 1, but found {sum}.")
|
||||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
|
||||
"`kb.add_entity` and `kb.add_alias` to add entries.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty.")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
"should be of equal length.")
|
||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||
|
@ -951,7 +950,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
||||
"with `displacy.serve(doc, port=port)`")
|
||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||
"or use `auto_switch_port=True` to pick an available port automatically.")
|
||||
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||
|
||||
# v4 error strings
|
||||
E4000 = ("Expected a Doc as input, but got: '{type}'")
|
||||
|
@ -962,6 +961,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"reference and predicted docs.")
|
||||
E4004 = ("Backprop is not supported when is_train is not set.")
|
||||
|
||||
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
|
||||
|
||||
# fmt: on
|
||||
|
||||
|
|
|
@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
self._alias_index = PreshMap(nr_aliases + 1)
|
||||
self._aliases_table = alias_vec(nr_aliases + 1)
|
||||
|
||||
def is_empty(self):
|
||||
return len(self) == 0
|
||||
|
||||
def __len__(self):
|
||||
return self.get_size_entities()
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):
|
|||
|
||||
|
||||
class Icelandic(Language):
|
||||
lang = "is"
|
||||
lang = "isl"
|
||||
Defaults = IcelandicDefaults
|
||||
|
||||
|
|
@ -3,10 +3,10 @@ from ...language import Language
|
|||
|
||||
class MultiLanguage(Language):
|
||||
"""Language class to be used for models that support multiple languages.
|
||||
This module allows models to specify their language ID as 'xx'.
|
||||
This module allows models to specify their language ID as 'mul'.
|
||||
"""
|
||||
|
||||
lang = "xx"
|
||||
lang = "mul"
|
||||
|
||||
|
||||
__all__ = ["MultiLanguage"]
|
|
@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
|
|||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
# Punctuation stolen from Danish
|
||||
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
class SwedishDefaults(BaseDefaults):
|
||||
|
|
33
spacy/lang/sv/punctuation.py
Normal file
33
spacy/lang/sv/punctuation.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
from ..punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
|
||||
]
|
||||
)
|
||||
|
||||
_suffixes = [
|
||||
suffix
|
||||
for suffix in TOKENIZER_SUFFIXES
|
||||
if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
|
||||
]
|
||||
_suffixes += [r"(?<=[^sSxXzZ])\'"]
|
||||
|
||||
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
|
@ -106,7 +106,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
|||
|
||||
@registry.misc("spacy.LookupsDataLoader.v1")
|
||||
def load_lookups_data(lang, tables):
|
||||
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
|
||||
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
|
||||
lookups = load_lookups(lang=lang, tables=tables)
|
||||
return lookups
|
||||
|
||||
|
@ -174,8 +174,7 @@ class Language:
|
|||
if not isinstance(vocab, Vocab) and vocab is not True:
|
||||
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
|
||||
if vocab is True:
|
||||
vectors_name = meta.get("vectors", {}).get("name")
|
||||
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
|
||||
vocab = create_vocab(self.lang, self.Defaults)
|
||||
else:
|
||||
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
|
||||
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
|
||||
|
@ -229,7 +228,6 @@ class Language:
|
|||
"width": self.vocab.vectors_length,
|
||||
"vectors": len(self.vocab.vectors),
|
||||
"keys": self.vocab.vectors.n_keys,
|
||||
"name": self.vocab.vectors.name,
|
||||
"mode": self.vocab.vectors.mode,
|
||||
}
|
||||
self._meta["labels"] = dict(self.pipe_labels)
|
||||
|
@ -1248,17 +1246,12 @@ class Language:
|
|||
component_cfg[name].setdefault("drop", drop)
|
||||
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
|
||||
for name, proc in self.pipeline:
|
||||
# ignore statements are used here because mypy ignores hasattr
|
||||
if name not in exclude and hasattr(proc, "update"):
|
||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore
|
||||
if sgd not in (None, False):
|
||||
if (
|
||||
name not in exclude
|
||||
and isinstance(proc, ty.TrainableComponent)
|
||||
and proc.is_trainable
|
||||
and proc.model not in (True, False, None)
|
||||
):
|
||||
proc.finish_update(sgd)
|
||||
if (
|
||||
name not in exclude
|
||||
and isinstance(proc, ty.TrainableComponent)
|
||||
and proc.is_trainable
|
||||
):
|
||||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
||||
if name in annotates:
|
||||
for doc, eg in zip(
|
||||
_pipe(
|
||||
|
@ -1271,6 +1264,17 @@ class Language:
|
|||
examples,
|
||||
):
|
||||
eg.predicted = doc
|
||||
# Only finish the update after all component updates are done. Some
|
||||
# components may share weights (such as tok2vec) and we only want
|
||||
# to apply weight updates after all gradients are accumulated.
|
||||
for name, proc in self.pipeline:
|
||||
if (
|
||||
name not in exclude
|
||||
and isinstance(proc, ty.TrainableComponent)
|
||||
and proc.is_trainable
|
||||
):
|
||||
proc.finish_update(sgd)
|
||||
|
||||
return losses
|
||||
|
||||
def rehearse(
|
||||
|
@ -2068,7 +2072,7 @@ class Language:
|
|||
pipe = self.get_pipe(pipe_name)
|
||||
pipe_cfg = self._pipe_configs[pipe_name]
|
||||
if listeners:
|
||||
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
|
||||
util.logger.debug("Replacing listeners of component '%s'", pipe_name)
|
||||
if len(list(listeners)) != len(pipe_listeners):
|
||||
# The number of listeners defined in the component model doesn't
|
||||
# match the listeners to replace, so we won't be able to update
|
||||
|
@ -2191,9 +2195,6 @@ class Language:
|
|||
if path.exists():
|
||||
data = srsly.read_json(path)
|
||||
self.meta.update(data)
|
||||
# self.meta always overrides meta["vectors"] with the metadata
|
||||
# from self.vocab.vectors, so set the name directly
|
||||
self.vocab.vectors.name = data.get("vectors", {}).get("name")
|
||||
|
||||
def deserialize_vocab(path: Path) -> None:
|
||||
if path.exists():
|
||||
|
@ -2262,9 +2263,6 @@ class Language:
|
|||
def deserialize_meta(b):
|
||||
data = srsly.json_loads(b)
|
||||
self.meta.update(data)
|
||||
# self.meta always overrides meta["vectors"] with the metadata
|
||||
# from self.vocab.vectors, so set the name directly
|
||||
self.vocab.vectors.name = data.get("vectors", {}).get("name")
|
||||
|
||||
deserializers: Dict[str, Callable[[bytes], Any]] = {}
|
||||
deserializers["config.cfg"] = lambda b: self.config.from_bytes(
|
||||
|
|
|
@ -82,8 +82,12 @@ cdef class DependencyMatcher:
|
|||
"$-": self._imm_left_sib,
|
||||
"$++": self._right_sib,
|
||||
"$--": self._left_sib,
|
||||
">+": self._imm_right_child,
|
||||
">-": self._imm_left_child,
|
||||
">++": self._right_child,
|
||||
">--": self._left_child,
|
||||
"<+": self._imm_right_parent,
|
||||
"<-": self._imm_left_parent,
|
||||
"<++": self._right_parent,
|
||||
"<--": self._left_parent,
|
||||
}
|
||||
|
@ -427,12 +431,34 @@ cdef class DependencyMatcher:
|
|||
def _left_sib(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
||||
|
||||
def _imm_right_child(self, doc, node):
|
||||
for child in doc[node].children:
|
||||
if child.i == node + 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _imm_left_child(self, doc, node):
|
||||
for child in doc[node].children:
|
||||
if child.i == node - 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _right_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i > node]
|
||||
|
||||
def _left_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i < node]
|
||||
|
||||
def _imm_right_parent(self, doc, node):
|
||||
if doc[node].head.i == node + 1:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
def _imm_left_parent(self, doc, node):
|
||||
if doc[node].head.i == node - 1:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
def _right_parent(self, doc, node):
|
||||
if doc[node].head.i > node:
|
||||
return [doc[node].head]
|
||||
|
|
|
@ -829,6 +829,11 @@ def _get_attr_values(spec, string_store):
|
|||
return attr_values
|
||||
|
||||
|
||||
def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
|
||||
# tuple order affects performance
|
||||
return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
|
||||
|
||||
# These predicate helper classes are used to match the REGEX, IN, >= etc
|
||||
# extensions to the matcher introduced in #3173.
|
||||
|
||||
|
@ -848,7 +853,7 @@ class _FuzzyPredicate:
|
|||
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
||||
self.fuzzy = int(fuzz) if fuzz else -1
|
||||
self.fuzzy_compare = fuzzy_compare
|
||||
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
|
||||
|
||||
def __call__(self, Token token):
|
||||
if self.is_extension:
|
||||
|
@ -870,7 +875,7 @@ class _RegexPredicate:
|
|||
self.value = re.compile(value)
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -906,7 +911,7 @@ class _SetPredicate:
|
|||
self.value = set(get_string_id(v) for v in value)
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -978,7 +983,7 @@ class _ComparisonPredicate:
|
|||
self.value = value
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
|
|||
if isinstance(value, dict):
|
||||
for type_, cls in predicate_types.items():
|
||||
if type_ in value:
|
||||
key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
|
||||
key = _predicate_cache_key(attr, type_, value[type_])
|
||||
if key in seen_predicates:
|
||||
output.append(seen_predicates[key])
|
||||
else:
|
||||
|
|
|
@ -89,6 +89,14 @@ def load_kb(
|
|||
return kb_from_file
|
||||
|
||||
|
||||
@registry.misc("spacy.EmptyKB.v2")
|
||||
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
|
||||
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
|
||||
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
|
||||
|
||||
return empty_kb_factory
|
||||
|
||||
|
||||
@registry.misc("spacy.EmptyKB.v1")
|
||||
def empty_kb(
|
||||
entity_vector_length: int,
|
||||
|
|
|
@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
|
|||
cdef np.ndarray step_actions
|
||||
|
||||
scores = []
|
||||
while sizes.states >= 1:
|
||||
while sizes.states >= 1 and (actions is None or len(actions) > 0):
|
||||
step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
|
||||
step_actions = actions[0] if actions is not None else None
|
||||
assert step_actions is None or step_actions.size == sizes.states, \
|
||||
f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})"
|
||||
with nogil:
|
||||
_predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
|
||||
if actions is None:
|
||||
|
|
|
@ -13,7 +13,6 @@ from ..kb import KnowledgeBase, Candidate
|
|||
from ..ml import empty_kb
|
||||
from ..tokens import Doc, Span
|
||||
from .pipe import deserialize_config
|
||||
from .legacy.entity_linker import EntityLinker_v1
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from ..language import Language
|
||||
from ..vocab import Vocab
|
||||
|
@ -28,9 +27,6 @@ ActivationsT = Dict[str, Union[List[Ragged], List[str]]]
|
|||
|
||||
KNOWLEDGE_BASE_IDS = "kb_ids"
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.EntityLinker.v2"
|
||||
|
@ -61,7 +57,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"entity_vector_length": 64,
|
||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||
"overwrite": True,
|
||||
"overwrite": False,
|
||||
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
|
||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||
"use_gold_ents": True,
|
||||
"candidates_batch_size": 1,
|
||||
|
@ -88,6 +85,7 @@ def make_entity_linker(
|
|||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
use_gold_ents: bool,
|
||||
|
@ -110,6 +108,7 @@ def make_entity_linker(
|
|||
get_candidates_batch (
|
||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
|
@ -120,6 +119,12 @@ def make_entity_linker(
|
|||
"""
|
||||
|
||||
if not model.attrs.get("include_span_maker", False):
|
||||
try:
|
||||
from spacy_legacy.components.entity_linker import EntityLinker_v1
|
||||
except:
|
||||
raise ImportError(
|
||||
"In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
|
||||
)
|
||||
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
|
||||
return EntityLinker_v1(
|
||||
nlp.vocab,
|
||||
|
@ -145,6 +150,7 @@ def make_entity_linker(
|
|||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
get_candidates_batch=get_candidates_batch,
|
||||
generate_empty_kb=generate_empty_kb,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
use_gold_ents=use_gold_ents,
|
||||
|
@ -186,7 +192,8 @@ class EntityLinker(TrainablePipe):
|
|||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
use_gold_ents: bool,
|
||||
candidates_batch_size: int,
|
||||
|
@ -210,12 +217,15 @@ class EntityLinker(TrainablePipe):
|
|||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
||||
Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
overwrite (bool): Whether to overwrite existing non-empty annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
candidates_batch_size (int): Size of batches for entity candidate generation.
|
||||
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
|
||||
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
"""
|
||||
|
||||
|
@ -232,6 +242,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self.labels_discard = list(labels_discard)
|
||||
# how many neighbour sentences to take into account
|
||||
self.n_sents = n_sents
|
||||
self.incl_prior = incl_prior
|
||||
self.incl_context = incl_context
|
||||
|
@ -239,9 +250,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.get_candidates_batch = get_candidates_batch
|
||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
# how many neighbour sentences to take into account
|
||||
# create an empty KB by default
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
|
||||
self.scorer = scorer
|
||||
self.use_gold_ents = use_gold_ents
|
||||
self.candidates_batch_size = candidates_batch_size
|
||||
|
@ -263,7 +272,7 @@ class EntityLinker(TrainablePipe):
|
|||
# Raise an error if the knowledge base is not initialized.
|
||||
if self.kb is None:
|
||||
raise ValueError(Errors.E1018.format(name=self.name))
|
||||
if len(self.kb) == 0:
|
||||
if hasattr(self.kb, "is_empty") and self.kb.is_empty():
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
||||
def initialize(
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
from .entity_linker import EntityLinker_v1
|
||||
|
||||
__all__ = ["EntityLinker_v1"]
|
|
@ -1,422 +0,0 @@
|
|||
# This file is present to provide a prior version of the EntityLinker component
|
||||
# for backwards compatability. For details see #9669.
|
||||
|
||||
from typing import Optional, Iterable, Callable, Dict, Union, List, Any
|
||||
from thinc.types import Floats2d
|
||||
from pathlib import Path
|
||||
from itertools import islice
|
||||
import srsly
|
||||
import random
|
||||
from thinc.api import CosineDistance, Model, Optimizer
|
||||
from thinc.api import set_dropout_rate
|
||||
import warnings
|
||||
|
||||
from ...kb import KnowledgeBase, Candidate
|
||||
from ...ml import empty_kb
|
||||
from ...tokens import Doc, Span
|
||||
from ..pipe import deserialize_config
|
||||
from ..trainable_pipe import TrainablePipe
|
||||
from ...language import Language
|
||||
from ...vocab import Vocab
|
||||
from ...training import Example, validate_examples, validate_get_examples
|
||||
from ...errors import Errors, Warnings
|
||||
from ...util import SimpleFrozenList
|
||||
from ... import util
|
||||
from ...scorer import Scorer
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
|
||||
|
||||
def entity_linker_score(examples, **kwargs):
|
||||
return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)
|
||||
|
||||
|
||||
class EntityLinker_v1(TrainablePipe):
|
||||
"""Pipeline component for named entity linking.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker
|
||||
"""
|
||||
|
||||
NIL = "NIL" # string used to refer to a non-existing link
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab: Vocab,
|
||||
model: Model,
|
||||
name: str = "entity_linker",
|
||||
*,
|
||||
labels_discard: Iterable[str],
|
||||
n_sents: int,
|
||||
incl_prior: bool,
|
||||
incl_context: bool,
|
||||
entity_vector_length: int,
|
||||
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
) -> None:
|
||||
"""Initialize an entity linker.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
|
||||
n_sents (int): The number of neighbouring sentences to take into account.
|
||||
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
|
||||
incl_context (bool): Whether or not to include the local context in the model.
|
||||
entity_vector_length (int): Size of encoding vectors in the KB.
|
||||
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
|
||||
produces a list of candidates, given a certain knowledge base and a textual mention.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||
DOCS: https://spacy.io/api/entitylinker#init
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.labels_discard = list(labels_discard)
|
||||
self.n_sents = n_sents
|
||||
self.incl_prior = incl_prior
|
||||
self.incl_context = incl_context
|
||||
self.get_candidates = get_candidates
|
||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
# how many neighbour sentences to take into account
|
||||
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.scorer = scorer
|
||||
|
||||
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
|
||||
"""Define the KB of this pipe by providing a function that will
|
||||
create it using this object's vocab."""
|
||||
if not callable(kb_loader):
|
||||
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
|
||||
|
||||
self.kb = kb_loader(self.vocab)
|
||||
|
||||
def validate_kb(self) -> None:
|
||||
# Raise an error if the knowledge base is not initialized.
|
||||
if self.kb is None:
|
||||
raise ValueError(Errors.E1018.format(name=self.name))
|
||||
if len(self.kb) == 0:
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Callable[[], Iterable[Example]],
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
|
||||
):
|
||||
"""Initialize the pipe for training, using a representative set
|
||||
of data examples.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
|
||||
Note that providing this argument, will overwrite all data accumulated in the current KB.
|
||||
Use this only when loading a KB as-such from file.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#initialize
|
||||
"""
|
||||
validate_get_examples(get_examples, "EntityLinker_v1.initialize")
|
||||
if kb_loader is not None:
|
||||
self.set_kb(kb_loader)
|
||||
self.validate_kb()
|
||||
nO = self.kb.entity_vector_length
|
||||
doc_sample = []
|
||||
vector_sample = []
|
||||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.x)
|
||||
vector_sample.append(self.model.ops.alloc1f(nO))
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(
|
||||
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
||||
)
|
||||
|
||||
def update(
|
||||
self,
|
||||
examples: Iterable[Example],
|
||||
*,
|
||||
drop: float = 0.0,
|
||||
sgd: Optional[Optimizer] = None,
|
||||
losses: Optional[Dict[str, float]] = None,
|
||||
) -> Dict[str, float]:
|
||||
"""Learn from a batch of documents and gold-standard information,
|
||||
updating the pipe's model. Delegates to predict and get_loss.
|
||||
|
||||
examples (Iterable[Example]): A batch of Example objects.
|
||||
drop (float): The dropout rate.
|
||||
sgd (thinc.api.Optimizer): The optimizer.
|
||||
losses (Dict[str, float]): Optional record of the loss during training.
|
||||
Updated using the component name as the key.
|
||||
RETURNS (Dict[str, float]): The updated losses dictionary.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#update
|
||||
"""
|
||||
self.validate_kb()
|
||||
if losses is None:
|
||||
losses = {}
|
||||
losses.setdefault(self.name, 0.0)
|
||||
if not examples:
|
||||
return losses
|
||||
validate_examples(examples, "EntityLinker_v1.update")
|
||||
sentence_docs = []
|
||||
for eg in examples:
|
||||
sentences = [s for s in eg.reference.sents]
|
||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||
for ent in eg.reference.ents:
|
||||
# KB ID of the first token is the same as the whole span
|
||||
kb_id = kb_ids[ent.start]
|
||||
if kb_id:
|
||||
try:
|
||||
# find the sentence in the list of sentences.
|
||||
sent_index = sentences.index(ent.sent)
|
||||
except AttributeError:
|
||||
# Catch the exception when ent.sent is None and provide a user-friendly warning
|
||||
raise RuntimeError(Errors.E030) from None
|
||||
# get n previous sentences, if there are any
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
# get n posterior sentences, or as many < n as there are
|
||||
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
|
||||
# get token positions
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
# append that span as a doc to training
|
||||
sent_doc = eg.predicted[start_token:end_token].as_doc()
|
||||
sentence_docs.append(sent_doc)
|
||||
set_dropout_rate(self.model, drop)
|
||||
if not sentence_docs:
|
||||
warnings.warn(Warnings.W093.format(name="Entity Linker"))
|
||||
return losses
|
||||
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
|
||||
loss, d_scores = self.get_loss(
|
||||
sentence_encodings=sentence_encodings, examples=examples
|
||||
)
|
||||
bp_context(d_scores)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss
|
||||
return losses
|
||||
|
||||
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
|
||||
validate_examples(examples, "EntityLinker_v1.get_loss")
|
||||
entity_encodings = []
|
||||
for eg in examples:
|
||||
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
|
||||
for ent in eg.reference.ents:
|
||||
kb_id = kb_ids[ent.start]
|
||||
if kb_id:
|
||||
entity_encoding = self.kb.get_vector(kb_id)
|
||||
entity_encodings.append(entity_encoding)
|
||||
entity_encodings = self.model.ops.asarray2f(entity_encodings)
|
||||
if sentence_encodings.shape != entity_encodings.shape:
|
||||
err = Errors.E147.format(
|
||||
method="get_loss", msg="gold entities do not match up"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
|
||||
loss = self.distance.get_loss(sentence_encodings, entity_encodings)
|
||||
loss = loss / len(entity_encodings)
|
||||
return float(loss), gradients
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[str]:
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
Returns the KB IDs for each entity in each doc, including NIL if there is
|
||||
no prediction.
|
||||
|
||||
docs (Iterable[Doc]): The documents to predict.
|
||||
RETURNS (List[str]): The models prediction for each document.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#predict
|
||||
"""
|
||||
self.validate_kb()
|
||||
entity_count = 0
|
||||
final_kb_ids: List[str] = []
|
||||
if not docs:
|
||||
return final_kb_ids
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
for i, doc in enumerate(docs):
|
||||
sentences = [s for s in doc.sents]
|
||||
if len(doc) > 0:
|
||||
# Looping through each entity (TODO: rewrite)
|
||||
for ent in doc.ents:
|
||||
sent = ent.sent
|
||||
sent_index = sentences.index(sent)
|
||||
assert sent_index >= 0
|
||||
# get n_neighbour sentences, clipped to the length of the document
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
sent_doc = doc[start_token:end_token].as_doc()
|
||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||
xp = self.model.ops.xp
|
||||
if self.incl_context:
|
||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||
sentence_encoding_t = sentence_encoding.T
|
||||
sentence_norm = xp.linalg.norm(sentence_encoding_t)
|
||||
entity_count += 1
|
||||
if ent.label_ in self.labels_discard:
|
||||
# ignoring this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
else:
|
||||
candidates = list(self.get_candidates(self.kb, ent))
|
||||
if not candidates:
|
||||
# no prediction possible for this entity - setting to NIL
|
||||
final_kb_ids.append(self.NIL)
|
||||
elif len(candidates) == 1:
|
||||
# shortcut for efficiency reasons: take the 1 candidate
|
||||
final_kb_ids.append(candidates[0].entity_)
|
||||
else:
|
||||
random.shuffle(candidates)
|
||||
# set all prior probabilities to 0 if incl_prior=False
|
||||
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
||||
if not self.incl_prior:
|
||||
prior_probs = xp.asarray([0.0 for _ in candidates])
|
||||
scores = prior_probs
|
||||
# add in similarity from the context
|
||||
if self.incl_context:
|
||||
entity_encodings = xp.asarray(
|
||||
[c.entity_vector for c in candidates]
|
||||
)
|
||||
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
|
||||
if len(entity_encodings) != len(prior_probs):
|
||||
raise RuntimeError(
|
||||
Errors.E147.format(
|
||||
method="predict",
|
||||
msg="vectors not of equal length",
|
||||
)
|
||||
)
|
||||
# cosine similarity
|
||||
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
|
||||
sentence_norm * entity_norm
|
||||
)
|
||||
if sims.shape != prior_probs.shape:
|
||||
raise ValueError(Errors.E161)
|
||||
scores = prior_probs + sims - (prior_probs * sims)
|
||||
best_index = scores.argmax().item()
|
||||
best_candidate = candidates[best_index]
|
||||
final_kb_ids.append(best_candidate.entity_)
|
||||
if not (len(final_kb_ids) == entity_count):
|
||||
err = Errors.E147.format(
|
||||
method="predict", msg="result variables not of equal length"
|
||||
)
|
||||
raise RuntimeError(err)
|
||||
return final_kb_ids
|
||||
|
||||
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
|
||||
"""Modify a batch of documents, using pre-computed scores.
|
||||
|
||||
docs (Iterable[Doc]): The documents to modify.
|
||||
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#set_annotations
|
||||
"""
|
||||
count_ents = len([ent for doc in docs for ent in doc.ents])
|
||||
if count_ents != len(kb_ids):
|
||||
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
|
||||
i = 0
|
||||
overwrite = self.cfg["overwrite"]
|
||||
for doc in docs:
|
||||
for ent in doc.ents:
|
||||
kb_id = kb_ids[i]
|
||||
i += 1
|
||||
for token in ent:
|
||||
if token.ent_kb_id == 0 or overwrite:
|
||||
token.ent_kb_id_ = kb_id
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()):
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#to_bytes
|
||||
"""
|
||||
self._validate_serialization_attrs()
|
||||
serialize = {}
|
||||
if hasattr(self, "cfg") and self.cfg is not None:
|
||||
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||
serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
|
||||
serialize["kb"] = self.kb.to_bytes
|
||||
serialize["model"] = self.model.to_bytes
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
||||
"""Load the pipe from a bytestring.
|
||||
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (TrainablePipe): The loaded object.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#from_bytes
|
||||
"""
|
||||
self._validate_serialization_attrs()
|
||||
|
||||
def load_model(b):
|
||||
try:
|
||||
self.model.from_bytes(b)
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserialize = {}
|
||||
if hasattr(self, "cfg") and self.cfg is not None:
|
||||
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
|
||||
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
|
||||
deserialize["kb"] = lambda b: self.kb.from_bytes(b)
|
||||
deserialize["model"] = load_model
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""Serialize the pipe to disk.
|
||||
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#to_disk
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
|
||||
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||
serialize["kb"] = lambda p: self.kb.to_disk(p)
|
||||
serialize["model"] = lambda p: self.model.to_disk(p)
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "EntityLinker_v1":
|
||||
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (EntityLinker): The modified EntityLinker object.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker#from_disk
|
||||
"""
|
||||
|
||||
def load_model(p):
|
||||
try:
|
||||
with p.open("rb") as infile:
|
||||
self.model.from_bytes(infile.read())
|
||||
except AttributeError:
|
||||
raise ValueError(Errors.E149) from None
|
||||
|
||||
deserialize: Dict[str, Callable[[Any], Any]] = {}
|
||||
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
|
||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
|
||||
deserialize["kb"] = lambda p: self.kb.from_disk(p)
|
||||
deserialize["model"] = load_model
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
return self
|
||||
|
||||
def rehearse(self, examples, *, sgd=None, losses=None, **config):
|
||||
raise NotImplementedError
|
||||
|
||||
def add_label(self, label):
|
||||
raise NotImplementedError
|
|
@ -21,10 +21,6 @@ from ..scorer import Scorer
|
|||
from ..training import validate_examples, validate_get_examples
|
||||
from ..util import registry
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = True
|
||||
BACKWARD_EXTEND = False
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -102,8 +98,8 @@ class Morphologizer(Tagger):
|
|||
model: Model,
|
||||
name: str = "morphologizer",
|
||||
*,
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
extend: bool = BACKWARD_EXTEND,
|
||||
overwrite: bool = False,
|
||||
extend: bool = False,
|
||||
scorer: Optional[Callable] = morphologizer_score,
|
||||
save_activations: bool = False,
|
||||
):
|
||||
|
@ -113,6 +109,8 @@ class Morphologizer(Tagger):
|
|||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
extend (bool): Whether to extend existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attributes "pos" and "morph" and
|
||||
Scorer.score_token_attr_per_feat for the attribute "morph".
|
||||
|
|
|
@ -10,9 +10,6 @@ from ..language import Language
|
|||
from ..scorer import Scorer
|
||||
from .. import util
|
||||
|
||||
# see #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
@Language.factory(
|
||||
"sentencizer",
|
||||
assigns=["token.is_sent_start", "doc.sents"],
|
||||
|
@ -52,13 +49,14 @@ class Sentencizer(Pipe):
|
|||
name="sentencizer",
|
||||
*,
|
||||
punct_chars=None,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
overwrite=False,
|
||||
scorer=senter_score,
|
||||
):
|
||||
"""Initialize the sentencizer.
|
||||
|
||||
punct_chars (list): Punctuation characters to split on. Will be
|
||||
serialized with the nlp object.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the attribute "sents".
|
||||
|
||||
|
|
|
@ -18,8 +18,6 @@ from ..training import validate_examples, validate_get_examples
|
|||
from ..util import registry
|
||||
from .. import util
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
|
@ -83,7 +81,7 @@ class SentenceRecognizer(Tagger):
|
|||
model,
|
||||
name="senter",
|
||||
*,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
overwrite=False,
|
||||
scorer=senter_score,
|
||||
save_activations: bool = False,
|
||||
):
|
||||
|
@ -93,6 +91,7 @@ class SentenceRecognizer(Tagger):
|
|||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the attribute "sents".
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
|
|
@ -27,9 +27,6 @@ from .. import util
|
|||
|
||||
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
||||
|
||||
# See #9050
|
||||
BACKWARD_OVERWRITE = False
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -99,7 +96,7 @@ class Tagger(TrainablePipe):
|
|||
model,
|
||||
name="tagger",
|
||||
*,
|
||||
overwrite=BACKWARD_OVERWRITE,
|
||||
overwrite=False,
|
||||
scorer=tagger_score,
|
||||
neg_prefix="!",
|
||||
save_activations: bool = False,
|
||||
|
@ -110,6 +107,7 @@ class Tagger(TrainablePipe):
|
|||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
overwrite (bool): Whether to overwrite existing annotations.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_token_attr for the attribute "tag".
|
||||
save_activations (bool): save model activations in Doc when annotating.
|
||||
|
|
|
@ -36,6 +36,11 @@ from ..errors import Errors, Warnings
|
|||
from .. import util
|
||||
|
||||
|
||||
# TODO: Remove when we switch to Cython 3.
|
||||
cdef extern from "<algorithm>" namespace "std" nogil:
|
||||
bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
|
||||
|
||||
|
||||
NUMPY_OPS = NumpyOps()
|
||||
|
||||
|
||||
|
@ -253,8 +258,8 @@ class Parser(TrainablePipe):
|
|||
# batch uniform length. Since we do not have a gold standard
|
||||
# sequence, we use the teacher's predictions as the gold
|
||||
# standard.
|
||||
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
|
||||
states = self._init_batch(teacher_pipe, student_docs, max_moves)
|
||||
max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
|
||||
states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
|
||||
else:
|
||||
states = self.moves.init_batch(student_docs)
|
||||
|
||||
|
@ -265,12 +270,12 @@ class Parser(TrainablePipe):
|
|||
# gradients of the student's transition distributions relative to the
|
||||
# teacher's distributions.
|
||||
|
||||
student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
|
||||
max_moves=max_moves)
|
||||
student_inputs = TransitionModelInputs(docs=student_docs,
|
||||
states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves)
|
||||
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
|
||||
actions = states2actions(student_states)
|
||||
actions = _states_diff_to_actions(states, student_states)
|
||||
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
|
||||
moves=self.moves, actions=actions)
|
||||
states=states, moves=teacher_pipe.moves, actions=actions)
|
||||
(_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
|
||||
|
||||
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
|
||||
|
@ -522,7 +527,7 @@ class Parser(TrainablePipe):
|
|||
set_dropout_rate(self.model, 0.0)
|
||||
student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
|
||||
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
|
||||
actions = states2actions(student_states)
|
||||
actions = _states_to_actions(student_states)
|
||||
teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
|
||||
_, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
|
||||
|
||||
|
@ -642,7 +647,7 @@ class Parser(TrainablePipe):
|
|||
raise ValueError(Errors.E149) from None
|
||||
return self
|
||||
|
||||
def _init_batch(self, teacher_step_model, docs, max_length):
|
||||
def _init_batch_from_teacher(self, teacher_pipe, docs, max_length):
|
||||
"""Make a square batch of length equal to the shortest transition
|
||||
sequence or a cap. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
|
@ -651,10 +656,12 @@ class Parser(TrainablePipe):
|
|||
_init_gold_batch, this version uses a teacher model to generate the
|
||||
cut sequences."""
|
||||
cdef:
|
||||
StateClass start_state
|
||||
StateClass state
|
||||
Transition action
|
||||
all_states = self.moves.init_batch(docs)
|
||||
TransitionSystem moves = teacher_pipe.moves
|
||||
|
||||
# Start with the same heuristic as in supervised training: exclude
|
||||
# docs that are within the maximum length.
|
||||
all_states = moves.init_batch(docs)
|
||||
states = []
|
||||
to_cut = []
|
||||
for state, doc in zip(all_states, docs):
|
||||
|
@ -663,18 +670,28 @@ class Parser(TrainablePipe):
|
|||
states.append(state)
|
||||
else:
|
||||
to_cut.append(state)
|
||||
|
||||
if not to_cut:
|
||||
return states
|
||||
|
||||
# Parse the states that are too long with the teacher's parsing model.
|
||||
teacher_inputs = TransitionModelInputs(docs=docs, moves=moves,
|
||||
states=[state.copy() for state in to_cut])
|
||||
(teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs)
|
||||
|
||||
# Step through the teacher's actions and store every state after
|
||||
# each multiple of max_length.
|
||||
teacher_actions = _states_to_actions(teacher_states)
|
||||
while to_cut:
|
||||
states.extend(state.copy() for state in to_cut)
|
||||
# Move states forward max_length actions.
|
||||
length = 0
|
||||
while to_cut and length < max_length:
|
||||
teacher_scores = teacher_step_model.predict(to_cut)
|
||||
self.transition_states(to_cut, teacher_scores)
|
||||
# States that are completed do not need further cutting.
|
||||
to_cut = [state for state in to_cut if not state.is_final()]
|
||||
length += 1
|
||||
return states
|
||||
for step_actions in teacher_actions[:max_length]:
|
||||
to_cut = moves.apply_actions(to_cut, step_actions)
|
||||
teacher_actions = teacher_actions[max_length:]
|
||||
|
||||
if len(teacher_actions) < max_length:
|
||||
break
|
||||
|
||||
return states
|
||||
|
||||
def _init_gold_batch(self, examples, max_length):
|
||||
"""Make a square batch, of length equal to the shortest transition
|
||||
|
@ -736,7 +753,7 @@ def _change_attrs(model, **kwargs):
        model.attrs[key] = value


def states2actions(states: List[StateClass]) -> List[Ints1d]:
def _states_to_actions(states: List[StateClass]) -> List[Ints1d]:
    cdef int step
    cdef StateClass state
    cdef StateC* c_state

@ -757,3 +774,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]:
        actions.append(numpy.array(step_actions, dtype="i"))

    return actions


def _states_diff_to_actions(
    before_states: List[StateClass],
    after_states: List[StateClass]
) -> List[Ints1d]:
    """
    Return for two sets of states the actions to go from the first set of
    states to the second set of states. The histories of the first set of
    states must be a prefix of the second set of states.
    """
    cdef StateClass before_state, after_state
    cdef StateC* c_state_before
    cdef StateC* c_state_after

    assert len(before_states) == len(after_states)

    # Check invariant: before states histories must be prefixes of after states.
    for before_state, after_state in zip(before_states, after_states):
        c_state_before = before_state.c
        c_state_after = after_state.c

        assert equal(c_state_before.history.begin(), c_state_before.history.end(),
                     c_state_after.history.begin())

    actions = []
    while True:
        step = len(actions)

        step_actions = []
        for before_state, after_state in zip(before_states, after_states):
            c_state_before = before_state.c
            c_state_after = after_state.c
            if step < c_state_after.history.size() - c_state_before.history.size():
                step_actions.append(c_state_after.history[c_state_before.history.size() + step])

        # We are done if we have exhausted all histories.
        if len(step_actions) == 0:
            break

        actions.append(numpy.array(step_actions, dtype="i"))

    return actions

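The Cython helper above relies only on each "before" action history being a
prefix of the matching "after" history. A pure-Python sketch of the same idea,
using plain lists of action IDs instead of parser states (illustrative only,
not the library API):

def states_diff_to_actions(before_histories, after_histories):
    # Each "before" history must be a prefix of the matching "after" history.
    assert len(before_histories) == len(after_histories)
    for before, after in zip(before_histories, after_histories):
        assert after[: len(before)] == before
    actions = []
    while True:
        step = len(actions)
        step_actions = [
            after[len(before) + step]
            for before, after in zip(before_histories, after_histories)
            if step < len(after) - len(before)
        ]
        if not step_actions:  # all remaining histories exhausted
            break
        actions.append(step_actions)
    return actions

# Example: the first state still needs action 5, the second needs 7 then 8.
assert states_diff_to_actions([[1, 2], [3]], [[1, 2, 5], [3, 7, 8]]) == [[5, 7], [8]]
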
@ -104,7 +104,7 @@ class Scorer:
    def __init__(
        self,
        nlp: Optional["Language"] = None,
        default_lang: str = "xx",
        default_lang: str = "mul",
        default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
        **cfg,
    ) -> None:

@ -86,7 +86,7 @@ These are the main fixtures that are currently available:

| Fixture                             | Description                                                                    |
| ----------------------------------- | ------------------------------------------------------------------------------ |
| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `xx` language class.  |
| `tokenizer`                         | Basic, language-independent tokenizer. Identical to the `mul` language class. |
| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer.                                     |
| `en_vocab`                          | Creates an instance of the English `Vocab`.                                    |

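The fixture table and the Scorer default above reflect the renamed language
codes in this branch: the multi-language code "xx" becomes "mul" and Icelandic
"is" becomes "isl". A short sketch of the new spelling (assuming the renamed
codes are registered, as the updated tests expect):

import spacy

nlp = spacy.blank("mul")   # was: spacy.blank("xx")
doc = nlp("Säʹmmla lie Euroopp unioon oʹdinakai alggmeer.")
print([token.text for token in doc])
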
@ -83,7 +83,7 @@ def register_cython_tests(cython_mod_name: str, test_mod_name: str):

@pytest.fixture(scope="module")
def tokenizer():
    return get_lang_class("xx")().tokenizer
    return get_lang_class("mul")().tokenizer


@pytest.fixture(scope="session")

@ -243,8 +243,8 @@ def id_tokenizer():


@pytest.fixture(scope="session")
def is_tokenizer():
    return get_lang_class("is")().tokenizer
def isl_tokenizer():
    return get_lang_class("isl")().tokenizer


@pytest.fixture(scope="session")

@ -496,8 +496,8 @@ def vi_tokenizer():


@pytest.fixture(scope="session")
def xx_tokenizer():
    return get_lang_class("xx")().tokenizer
def mul_tokenizer():
    return get_lang_class("mul")().tokenizer


@pytest.fixture(scope="session")

@ -9,7 +9,7 @@ from thinc.api import NumpyOps, get_current_ops
|
|||
from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
|
||||
from spacy.attrs import SENT_START, TAG
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.xx import MultiLanguage
|
||||
from spacy.lang.mul import MultiLanguage
|
||||
from spacy.language import Language
|
||||
from spacy.lexeme import Lexeme
|
||||
from spacy.tokens import Doc, Span, SpanGroup, Token
|
||||
|
|
|
@ -175,6 +175,18 @@ def test_modify_span_group(doc):
|
|||
assert group[0].label == doc.vocab.strings["TEST"]
|
||||
|
||||
|
||||
def test_char_span_attributes(doc):
|
||||
label = "LABEL"
|
||||
kb_id = "KB_ID"
|
||||
span_id = "SPAN_ID"
|
||||
span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
|
||||
span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
|
||||
assert span1.text == span2.text
|
||||
assert span1.label_ == span2.label_ == label
|
||||
assert span1.kb_id_ == span2.kb_id_ == kb_id
|
||||
assert span1.id_ == span2.id_ == span_id
|
||||
|
||||
|
||||
def test_spans_sent_spans(doc):
|
||||
sents = list(doc.sents)
|
||||
assert sents[0].start == 0
|
||||
|
@ -354,6 +366,14 @@ def test_spans_by_character(doc):
|
|||
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
|
||||
)
|
||||
|
||||
# Span.char_span + alignment mode "contract"
|
||||
span2 = doc[0:2].char_span(
|
||||
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
|
||||
)
|
||||
assert span1.start_char == span2.start_char
|
||||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == "GPE"
|
||||
|
||||
|
||||
def test_span_to_array(doc):
|
||||
span = doc[1:-2]
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
|
||||
|
||||
def test_long_text(is_tokenizer):
|
||||
def test_long_text(isl_tokenizer):
|
||||
# Excerpt: European Convention on Human Rights
|
||||
text = """
|
||||
hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja
|
||||
|
@ -15,12 +15,12 @@ réttlætis og friðar í heiminum og best er tryggt, annars vegar með
|
|||
virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
|
||||
og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
|
||||
"""
|
||||
tokens = is_tokenizer(text)
|
||||
tokens = isl_tokenizer(text)
|
||||
assert len(tokens) == 120
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_ordinal_number(is_tokenizer):
|
||||
def test_ordinal_number(isl_tokenizer):
|
||||
text = "10. desember 1948"
|
||||
tokens = is_tokenizer(text)
|
||||
tokens = isl_tokenizer(text)
|
||||
assert len(tokens) == 3
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
|
||||
IS_BASIC_TOKENIZATION_TESTS = [
|
||||
ISL_BASIC_TOKENIZATION_TESTS = [
|
||||
(
|
||||
"Enginn maður skal sæta pyndingum eða ómannlegri eða "
|
||||
"vanvirðandi meðferð eða refsingu. ",
|
||||
|
@ -23,8 +23,8 @@ IS_BASIC_TOKENIZATION_TESTS = [
|
|||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
|
||||
def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
|
||||
tokens = is_tokenizer(text)
|
||||
@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS)
|
||||
def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens):
|
||||
tokens = isl_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
assert expected_tokens == token_list
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
|
||||
|
||||
def test_long_text(xx_tokenizer):
|
||||
def test_long_text(mul_tokenizer):
|
||||
# Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
|
||||
text = """
|
||||
Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.
|
||||
|
@ -20,5 +20,5 @@ vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuu
|
|||
Sääʹmteʹǧǧ.
|
||||
"""
|
||||
|
||||
tokens = xx_tokenizer(text)
|
||||
tokens = mul_tokenizer(text)
|
||||
assert len(tokens) == 179
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
|
||||
XX_BASIC_TOKENIZATION_TESTS = [
|
||||
MUL_BASIC_TOKENIZATION_TESTS = [
|
||||
(
|
||||
"Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
|
||||
[
|
||||
|
@ -18,8 +18,8 @@ XX_BASIC_TOKENIZATION_TESTS = [
|
|||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
|
||||
def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
|
||||
tokens = xx_tokenizer(text)
|
||||
@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
|
||||
def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
|
||||
tokens = mul_tokenizer(text)
|
||||
token_list = [token.text for token in tokens if not token.is_space]
|
||||
assert expected_tokens == token_list
|
|
@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 3


@pytest.mark.issue(12311)
@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
    tokens = sv_tokenizer(text)
    assert len(tokens) == 1

@ -7,10 +7,10 @@ from spacy.util import get_lang_class
# excluded: ja, ko, th, vi, zh
LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
             "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
             "hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
             "mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
             "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
             "tr", "tt", "uk", "ur", "xx", "yo"]
             "tr", "tt", "uk", "ur", "yo"]
# fmt: on

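The updated list drops "xx" and "is" in favour of "mul" and "isl". A quick
standalone check in the spirit of the parametrized test above (illustrative
only):

from spacy.util import get_lang_class

for lang in ("isl", "mul"):
    nlp = get_lang_class(lang)()   # instantiate the bare Language subclass
    doc = nlp("hello world")
    assert len(doc) == 2
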
@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
|
|||
("the", "brown", "$--", 0),
|
||||
("brown", "the", "$--", 1),
|
||||
("brown", "brown", "$--", 0),
|
||||
("over", "jumped", "<+", 0),
|
||||
("quick", "fox", "<+", 0),
|
||||
("the", "quick", "<+", 0),
|
||||
("brown", "fox", "<+", 1),
|
||||
("quick", "fox", "<++", 1),
|
||||
("quick", "over", "<++", 0),
|
||||
("over", "jumped", "<++", 0),
|
||||
("the", "fox", "<++", 2),
|
||||
("brown", "fox", "<-", 0),
|
||||
("fox", "over", "<-", 0),
|
||||
("the", "over", "<-", 0),
|
||||
("over", "jumped", "<-", 1),
|
||||
("brown", "fox", "<--", 0),
|
||||
("fox", "jumped", "<--", 0),
|
||||
("fox", "over", "<--", 1),
|
||||
("fox", "brown", ">+", 0),
|
||||
("over", "fox", ">+", 0),
|
||||
("over", "the", ">+", 0),
|
||||
("jumped", "over", ">+", 1),
|
||||
("jumped", "over", ">++", 1),
|
||||
("fox", "lazy", ">++", 0),
|
||||
("over", "the", ">++", 0),
|
||||
("jumped", "over", ">-", 0),
|
||||
("fox", "quick", ">-", 0),
|
||||
("brown", "quick", ">-", 0),
|
||||
("fox", "brown", ">-", 1),
|
||||
("brown", "fox", ">--", 0),
|
||||
("fox", "brown", ">--", 1),
|
||||
("jumped", "fox", ">--", 1),
|
||||
|
|
spacy/tests/parser/test_model.py (new file, 61 lines)

@ -0,0 +1,61 @@
import numpy
|
||||
import pytest
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.ml.tb_framework import TransitionModelInputs
|
||||
from spacy.training import Example
|
||||
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
"They trade mortgage-backed securities.",
|
||||
{
|
||||
"heads": [1, 1, 4, 4, 5, 1, 1],
|
||||
"deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
|
||||
},
|
||||
),
|
||||
(
|
||||
"I like London and Berlin.",
|
||||
{
|
||||
"heads": [1, 1, 1, 2, 2, 1],
|
||||
"deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def nlp_parser():
|
||||
nlp = English()
|
||||
parser = nlp.add_pipe("parser")
|
||||
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for dep in annotations["deps"]:
|
||||
parser.add_label(dep)
|
||||
nlp.initialize()
|
||||
|
||||
return nlp, parser
|
||||
|
||||
|
||||
def test_incorrect_number_of_actions(nlp_parser):
|
||||
nlp, parser = nlp_parser
|
||||
doc = nlp.make_doc("test")
|
||||
|
||||
# Too many actions for the number of docs
|
||||
with pytest.raises(AssertionError):
|
||||
parser.model.predict(
|
||||
TransitionModelInputs(
|
||||
docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")]
|
||||
)
|
||||
)
|
||||
|
||||
# Too few actions for the number of docs
|
||||
with pytest.raises(AssertionError):
|
||||
parser.model.predict(
|
||||
TransitionModelInputs(
|
||||
docs=[doc, doc],
|
||||
moves=parser.moves,
|
||||
actions=[numpy.array([0], dtype="i")],
|
||||
)
|
||||
)
|
|
@ -623,7 +623,9 @@ def test_is_distillable():
|
|||
assert ner.is_distillable
|
||||
|
||||
|
||||
def test_distill():
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
|
||||
def test_distill(max_moves):
|
||||
teacher = English()
|
||||
teacher_ner = teacher.add_pipe("ner")
|
||||
train_examples = []
|
||||
|
@ -641,6 +643,7 @@ def test_distill():
|
|||
|
||||
student = English()
|
||||
student_ner = student.add_pipe("ner")
|
||||
student_ner.cfg["update_with_oracle_cut_size"] = max_moves
|
||||
student_ner.initialize(
|
||||
get_examples=lambda: train_examples, labels=teacher_ner.label_data
|
||||
)
|
||||
|
|
|
@ -463,7 +463,9 @@ def test_is_distillable():
|
|||
assert parser.is_distillable
|
||||
|
||||
|
||||
def test_distill():
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("max_moves", [0, 1, 5, 100])
|
||||
def test_distill(max_moves):
|
||||
teacher = English()
|
||||
teacher_parser = teacher.add_pipe("parser")
|
||||
train_examples = []
|
||||
|
@ -481,6 +483,7 @@ def test_distill():
|
|||
|
||||
student = English()
|
||||
student_parser = student.add_pipe("parser")
|
||||
student_parser.cfg["update_with_oracle_cut_size"] = max_moves
|
||||
student_parser.initialize(
|
||||
get_examples=lambda: train_examples, labels=teacher_parser.label_data
|
||||
)
|
||||
|
|
|
@ -54,9 +54,11 @@ def test_annotates_on_update():
|
|||
return AssertSents(name)
|
||||
|
||||
class AssertSents:
|
||||
model = None
|
||||
is_trainable = True
|
||||
|
||||
def __init__(self, name, **cfg):
|
||||
self.name = name
|
||||
pass
|
||||
|
||||
def __call__(self, doc):
|
||||
if not doc.has_annotation("SENT_START"):
|
||||
|
@ -64,10 +66,16 @@ def test_annotates_on_update():
|
|||
return doc
|
||||
|
||||
def update(self, examples, *, drop=0.0, sgd=None, losses=None):
|
||||
losses.setdefault(self.name, 0.0)
|
||||
|
||||
for example in examples:
|
||||
if not example.predicted.has_annotation("SENT_START"):
|
||||
raise ValueError("No sents")
|
||||
return {}
|
||||
|
||||
return losses
|
||||
|
||||
def finish_update(self, sgd=None):
|
||||
pass
|
||||
|
||||
nlp = English()
|
||||
nlp.add_pipe("sentencizer")
|
||||
|
|
|
@ -12,7 +12,6 @@ from spacy.lang.en import English
|
|||
from spacy.ml import load_kb
|
||||
from spacy.ml.models.entity_linker import build_span_maker
|
||||
from spacy.pipeline import EntityLinker, TrainablePipe
|
||||
from spacy.pipeline.legacy import EntityLinker_v1
|
||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
from spacy.scorer import Scorer
|
||||
from spacy.tests.util import make_tempdir
|
||||
|
@ -354,6 +353,9 @@ def test_kb_default(nlp):
|
|||
"""Test that the default (empty) KB is loaded upon construction"""
|
||||
entity_linker = nlp.add_pipe("entity_linker", config={})
|
||||
assert len(entity_linker.kb) == 0
|
||||
with pytest.raises(ValueError, match="E139"):
|
||||
# this raises an error because the KB is empty
|
||||
entity_linker.validate_kb()
|
||||
assert entity_linker.kb.get_size_entities() == 0
|
||||
assert entity_linker.kb.get_size_aliases() == 0
|
||||
# 64 is the default value from pipeline.entity_linker
|
||||
|
@ -997,6 +999,8 @@ def test_scorer_links():
|
|||
)
|
||||
# fmt: on
|
||||
def test_legacy_architectures(name, config):
|
||||
from spacy_legacy.components.entity_linker import EntityLinker_v1
|
||||
|
||||
# Ensure that the legacy architectures still work
|
||||
vector_length = 3
|
||||
nlp = English()
|
||||
|
|
|
@ -47,7 +47,7 @@ def person_org_date_patterns(person_org_patterns):
|
|||
|
||||
def test_span_ruler_add_empty(patterns):
|
||||
"""Test that patterns don't get added excessively."""
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler", config={"validate": True})
|
||||
ruler.add_patterns(patterns)
|
||||
pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
|
||||
|
@ -58,7 +58,7 @@ def test_span_ruler_add_empty(patterns):
|
|||
|
||||
|
||||
def test_span_ruler_init(patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(patterns)
|
||||
assert len(ruler) == len(patterns)
|
||||
|
@ -74,7 +74,7 @@ def test_span_ruler_init(patterns):
|
|||
|
||||
|
||||
def test_span_ruler_no_patterns_warns():
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
assert len(ruler) == 0
|
||||
assert len(ruler.labels) == 0
|
||||
|
@ -86,7 +86,7 @@ def test_span_ruler_no_patterns_warns():
|
|||
|
||||
def test_span_ruler_init_patterns(patterns):
|
||||
# initialize with patterns
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
assert len(ruler.labels) == 0
|
||||
ruler.initialize(lambda: [], patterns=patterns)
|
||||
|
@ -110,7 +110,7 @@ def test_span_ruler_init_patterns(patterns):
|
|||
|
||||
def test_span_ruler_init_clear(patterns):
|
||||
"""Test that initialization clears patterns."""
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(patterns)
|
||||
assert len(ruler.labels) == 4
|
||||
|
@ -119,7 +119,7 @@ def test_span_ruler_init_clear(patterns):
|
|||
|
||||
|
||||
def test_span_ruler_clear(patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(patterns)
|
||||
assert len(ruler.labels) == 4
|
||||
|
@ -133,7 +133,7 @@ def test_span_ruler_clear(patterns):
|
|||
|
||||
|
||||
def test_span_ruler_existing(patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler", config={"overwrite": False})
|
||||
ruler.add_patterns(patterns)
|
||||
doc = nlp.make_doc("OH HELLO WORLD bye bye")
|
||||
|
@ -148,7 +148,7 @@ def test_span_ruler_existing(patterns):
|
|||
|
||||
|
||||
def test_span_ruler_existing_overwrite(patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
|
||||
ruler.add_patterns(patterns)
|
||||
doc = nlp.make_doc("OH HELLO WORLD bye bye")
|
||||
|
@ -161,13 +161,13 @@ def test_span_ruler_existing_overwrite(patterns):
|
|||
|
||||
|
||||
def test_span_ruler_serialize_bytes(patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(patterns)
|
||||
assert len(ruler) == len(patterns)
|
||||
assert len(ruler.labels) == 4
|
||||
ruler_bytes = ruler.to_bytes()
|
||||
new_nlp = spacy.blank("xx")
|
||||
new_nlp = spacy.blank("mul")
|
||||
new_ruler = new_nlp.add_pipe("span_ruler")
|
||||
assert len(new_ruler) == 0
|
||||
assert len(new_ruler.labels) == 0
|
||||
|
@ -181,7 +181,7 @@ def test_span_ruler_serialize_bytes(patterns):
|
|||
|
||||
|
||||
def test_span_ruler_validate():
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
validated_ruler = nlp.add_pipe(
|
||||
"span_ruler", name="validated_span_ruler", config={"validate": True}
|
||||
|
@ -203,14 +203,14 @@ def test_span_ruler_validate():
|
|||
|
||||
|
||||
def test_span_ruler_properties(patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
|
||||
ruler.add_patterns(patterns)
|
||||
assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns]))
|
||||
|
||||
|
||||
def test_span_ruler_overlapping_spans(overlapping_patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(overlapping_patterns)
|
||||
doc = ruler(nlp.make_doc("foo bar baz"))
|
||||
|
@ -220,7 +220,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns):
|
|||
|
||||
|
||||
def test_span_ruler_scorer(overlapping_patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(overlapping_patterns)
|
||||
text = "foo bar baz"
|
||||
|
@ -243,7 +243,7 @@ def test_span_ruler_multiprocessing(n_process):
|
|||
|
||||
patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}]
|
||||
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(patterns)
|
||||
|
||||
|
@ -253,7 +253,7 @@ def test_span_ruler_multiprocessing(n_process):
|
|||
|
||||
|
||||
def test_span_ruler_serialize_dir(patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(patterns)
|
||||
with make_tempdir() as d:
|
||||
|
@ -264,7 +264,7 @@ def test_span_ruler_serialize_dir(patterns):
|
|||
|
||||
|
||||
def test_span_ruler_remove_basic(person_org_patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(person_org_patterns)
|
||||
doc = ruler(nlp.make_doc("Dina went to school"))
|
||||
|
@ -279,7 +279,7 @@ def test_span_ruler_remove_basic(person_org_patterns):
|
|||
|
||||
|
||||
def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(person_org_patterns)
|
||||
assert len(ruler.patterns) == 3
|
||||
|
@ -290,7 +290,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
|
|||
|
||||
|
||||
def test_span_ruler_remove_several_patterns(person_org_patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(person_org_patterns)
|
||||
doc = ruler(nlp.make_doc("Dina founded the company ACME."))
|
||||
|
@ -314,7 +314,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns):
|
|||
|
||||
|
||||
def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(person_org_date_patterns)
|
||||
doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th"))
|
||||
|
@ -332,7 +332,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
|
|||
|
||||
|
||||
def test_span_ruler_remove_all_patterns(person_org_date_patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
ruler.add_patterns(person_org_date_patterns)
|
||||
assert len(ruler.patterns) == 4
|
||||
|
@ -348,7 +348,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns):
|
|||
|
||||
|
||||
def test_span_ruler_remove_and_add():
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler")
|
||||
patterns1 = [{"label": "DATE1", "pattern": "last time"}]
|
||||
ruler.add_patterns(patterns1)
|
||||
|
@ -404,7 +404,7 @@ def test_span_ruler_remove_and_add():
|
|||
|
||||
|
||||
def test_span_ruler_spans_filter(overlapping_patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe(
|
||||
"span_ruler",
|
||||
config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}},
|
||||
|
@ -416,7 +416,7 @@ def test_span_ruler_spans_filter(overlapping_patterns):
|
|||
|
||||
|
||||
def test_span_ruler_ents_default_filter(overlapping_patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True})
|
||||
ruler.add_patterns(overlapping_patterns)
|
||||
doc = ruler(nlp.make_doc("foo bar baz"))
|
||||
|
@ -425,7 +425,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns):
|
|||
|
||||
|
||||
def test_span_ruler_ents_overwrite_filter(overlapping_patterns):
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe(
|
||||
"span_ruler",
|
||||
config={
|
||||
|
@ -452,7 +452,7 @@ def test_span_ruler_ents_bad_filter(overlapping_patterns):
|
|||
|
||||
return pass_through_filter
|
||||
|
||||
nlp = spacy.blank("xx")
|
||||
nlp = spacy.blank("mul")
|
||||
ruler = nlp.add_pipe(
|
||||
"span_ruler",
|
||||
config={
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
from typing import Callable
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, Any, Dict
|
||||
|
||||
from spacy import util
|
||||
from spacy.util import ensure_path, registry, load_model_from_config
|
||||
import srsly
|
||||
|
||||
from spacy import util, Errors
|
||||
from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
|
||||
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import Config
|
||||
|
@ -91,7 +94,10 @@ def test_serialize_subclassed_kb():
|
|||
|
||||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
|
||||
|
||||
[components.entity_linker.generate_empty_kb]
|
||||
@misc = "kb_test.CustomEmptyKB.v1"
|
||||
|
||||
[initialize]
|
||||
|
||||
[initialize.components]
|
||||
|
@ -99,7 +105,7 @@ def test_serialize_subclassed_kb():
|
|||
[initialize.components.entity_linker]
|
||||
|
||||
[initialize.components.entity_linker.kb_loader]
|
||||
@misc = "spacy.CustomKB.v1"
|
||||
@misc = "kb_test.CustomKB.v1"
|
||||
entity_vector_length = 342
|
||||
custom_field = 666
|
||||
"""
|
||||
|
@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
|
|||
super().__init__(vocab, entity_vector_length)
|
||||
self.custom_field = custom_field
|
||||
|
||||
@registry.misc("spacy.CustomKB.v1")
|
||||
def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
|
||||
"""We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
|
||||
path = ensure_path(path)
|
||||
if not path.exists():
|
||||
path.mkdir(parents=True)
|
||||
if not path.is_dir():
|
||||
raise ValueError(Errors.E928.format(loc=path))
|
||||
|
||||
def serialize_custom_fields(file_path: Path) -> None:
|
||||
srsly.write_json(file_path, {"custom_field": self.custom_field})
|
||||
|
||||
serialize = {
|
||||
"contents": lambda p: self.write_contents(p),
|
||||
"strings.json": lambda p: self.vocab.strings.to_disk(p),
|
||||
"custom_fields": lambda p: serialize_custom_fields(p),
|
||||
}
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
|
||||
"""We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
|
||||
path = ensure_path(path)
|
||||
if not path.exists():
|
||||
raise ValueError(Errors.E929.format(loc=path))
|
||||
if not path.is_dir():
|
||||
raise ValueError(Errors.E928.format(loc=path))
|
||||
|
||||
def deserialize_custom_fields(file_path: Path) -> None:
|
||||
self.custom_field = srsly.read_json(file_path)["custom_field"]
|
||||
|
||||
deserialize: Dict[str, Callable[[Any], Any]] = {
|
||||
"contents": lambda p: self.read_contents(p),
|
||||
"strings.json": lambda p: self.vocab.strings.from_disk(p),
|
||||
"custom_fields": lambda p: deserialize_custom_fields(p),
|
||||
}
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
|
||||
@registry.misc("kb_test.CustomEmptyKB.v1")
|
||||
def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
|
||||
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
|
||||
return SubInMemoryLookupKB(
|
||||
vocab=vocab,
|
||||
entity_vector_length=entity_vector_length,
|
||||
custom_field=0,
|
||||
)
|
||||
|
||||
return empty_kb_factory
|
||||
|
||||
@registry.misc("kb_test.CustomKB.v1")
|
||||
def custom_kb(
|
||||
entity_vector_length: int, custom_field: int
|
||||
) -> Callable[[Vocab], InMemoryLookupKB]:
|
||||
) -> Callable[[Vocab], SubInMemoryLookupKB]:
|
||||
def custom_kb_factory(vocab):
|
||||
kb = SubInMemoryLookupKB(
|
||||
vocab=vocab,
|
||||
|
@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
|
|||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
entity_linker2 = nlp2.get_pipe("entity_linker")
|
||||
# After IO, the KB is the standard one
|
||||
assert type(entity_linker2.kb) == InMemoryLookupKB
|
||||
assert type(entity_linker2.kb) == SubInMemoryLookupKB
|
||||
assert entity_linker2.kb.entity_vector_length == 342
|
||||
assert not hasattr(entity_linker2.kb, "custom_field")
|
||||
assert entity_linker2.kb.custom_field == 666
|
||||
|
|
|
@ -181,7 +181,7 @@ def test_issue4042_bug2():
|
|||
@pytest.mark.issue(4725)
|
||||
def test_issue4725_1():
|
||||
"""Ensure the pickling of the NER goes well"""
|
||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||
vocab = Vocab()
|
||||
nlp = English(vocab=vocab)
|
||||
config = {
|
||||
"update_with_oracle_cut_size": 111,
|
||||
|
|
|
@ -2,7 +2,6 @@ import os
|
|||
import math
|
||||
from collections import Counter
|
||||
from typing import Tuple, List, Dict, Any
|
||||
import pkg_resources
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -1017,8 +1016,6 @@ def test_local_remote_storage_pull_missing():
|
|||
|
||||
|
||||
def test_cli_find_threshold(capsys):
|
||||
thresholds = numpy.linspace(0, 1, 10)
|
||||
|
||||
def make_examples(nlp: Language) -> List[Example]:
|
||||
docs: List[Example] = []
|
||||
|
||||
|
@ -1082,8 +1079,6 @@ def test_cli_find_threshold(capsys):
|
|||
scores_key="cats_macro_f",
|
||||
silent=True,
|
||||
)
|
||||
assert best_threshold != thresholds[0]
|
||||
assert thresholds[0] < best_threshold < thresholds[9]
|
||||
assert best_score == max(res.values())
|
||||
assert res[1.0] == 0.0
|
||||
|
||||
|
@ -1091,7 +1086,7 @@ def test_cli_find_threshold(capsys):
|
|||
nlp, _ = init_nlp((("spancat", {}),))
|
||||
with make_tempdir() as nlp_dir:
|
||||
nlp.to_disk(nlp_dir)
|
||||
res = find_threshold(
|
||||
best_threshold, best_score, res = find_threshold(
|
||||
model=nlp_dir,
|
||||
data_path=docs_dir / "docs.spacy",
|
||||
pipe_name="spancat",
|
||||
|
@ -1099,10 +1094,8 @@ def test_cli_find_threshold(capsys):
|
|||
scores_key="spans_sc_f",
|
||||
silent=True,
|
||||
)
|
||||
assert res[0] != thresholds[0]
|
||||
assert thresholds[0] < res[0] < thresholds[8]
|
||||
assert res[1] >= 0.6
|
||||
assert res[2][1.0] == 0.0
|
||||
assert best_score == max(res.values())
|
||||
assert res[1.0] == 0.0
|
||||
|
||||
# Having multiple textcat_multilabel components should work, since the name has to be specified.
|
||||
nlp, _ = init_nlp((("textcat_multilabel", {}),))
|
||||
|
@ -1132,6 +1125,7 @@ def test_cli_find_threshold(capsys):
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||
@pytest.mark.parametrize(
|
||||
"reqs,output",
|
||||
[
|
||||
|
@ -1164,6 +1158,8 @@ def test_cli_find_threshold(capsys):
|
|||
],
|
||||
)
|
||||
def test_project_check_requirements(reqs, output):
|
||||
import pkg_resources
|
||||
|
||||
# excessive guard against unlikely package name
|
||||
try:
|
||||
pkg_resources.require("spacyunknowndoesnotexist12345")
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
import srsly
|
||||
from typer.testing import CliRunner
|
||||
from spacy.tokens import DocBin, Doc
|
||||
|
||||
from spacy.cli._util import app
|
||||
from .util import make_tempdir
|
||||
from .util import make_tempdir, normalize_whitespace
|
||||
|
||||
|
||||
def test_convert_auto():
|
||||
|
@ -38,8 +40,8 @@ def test_benchmark_accuracy_alias():
|
|||
# Verify that the `evaluate` alias works correctly.
|
||||
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
|
||||
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
|
||||
assert result_benchmark.stdout == result_evaluate.stdout.replace(
|
||||
"spacy evaluate", "spacy benchmark accuracy"
|
||||
assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
|
||||
result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
|
||||
)
|
||||
|
||||
|
||||
|
@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
|
|||
# Instead of checking specific wording of the output, which may change,
|
||||
# we'll check that this section of the debug output is present.
|
||||
assert "= Trainable Lemmatizer =" in result_debug_data.stdout
|
||||
|
||||
|
||||
# project tests
|
||||
|
||||
SAMPLE_PROJECT = {
|
||||
"title": "Sample project",
|
||||
"description": "This is a project for testing",
|
||||
"assets": [
|
||||
{
|
||||
"dest": "assets/spacy-readme.md",
|
||||
"url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
|
||||
"checksum": "411b2c89ccf34288fae8ed126bf652f7",
|
||||
},
|
||||
{
|
||||
"dest": "assets/citation.cff",
|
||||
"url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
|
||||
"checksum": "c996bfd80202d480eb2e592369714e5e",
|
||||
"extra": True,
|
||||
},
|
||||
],
|
||||
"commands": [
|
||||
{
|
||||
"name": "ok",
|
||||
"help": "print ok",
|
||||
"script": ["python -c \"print('okokok')\""],
|
||||
},
|
||||
{
|
||||
"name": "create",
|
||||
"help": "make a file",
|
||||
"script": ["touch abc.txt"],
|
||||
"outputs": ["abc.txt"],
|
||||
},
|
||||
{
|
||||
"name": "clean",
|
||||
"help": "remove test file",
|
||||
"script": ["rm abc.txt"],
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def project_dir():
|
||||
with make_tempdir() as pdir:
|
||||
(pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
|
||||
yield pdir
|
||||
|
||||
|
||||
def test_project_document(project_dir):
|
||||
readme_path = project_dir / "README.md"
|
||||
assert not readme_path.exists(), "README already exists"
|
||||
result = CliRunner().invoke(
|
||||
app, ["project", "document", str(project_dir), "-o", str(readme_path)]
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert readme_path.is_file()
|
||||
text = readme_path.read_text("utf-8")
|
||||
assert SAMPLE_PROJECT["description"] in text
|
||||
|
||||
|
||||
def test_project_assets(project_dir):
|
||||
asset_dir = project_dir / "assets"
|
||||
assert not asset_dir.exists(), "Assets dir is already present"
|
||||
result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
|
||||
# check that extras work
|
||||
result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
|
||||
|
||||
|
||||
def test_project_run(project_dir):
|
||||
# make sure dry run works
|
||||
test_file = project_dir / "abc.txt"
|
||||
result = CliRunner().invoke(
|
||||
app, ["project", "run", "--dry", "create", str(project_dir)]
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert not test_file.is_file()
|
||||
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert test_file.is_file()
|
||||
result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert "okokok" in result.stdout
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"options",
|
||||
[
|
||||
"",
|
||||
# "--sparse",
|
||||
"--branch v3",
|
||||
"--repo https://github.com/explosion/projects --branch v3",
|
||||
],
|
||||
)
|
||||
def test_project_clone(options):
|
||||
with make_tempdir() as workspace:
|
||||
out = workspace / "project"
|
||||
target = "benchmarks/ner_conll03"
|
||||
if not options:
|
||||
options = []
|
||||
else:
|
||||
options = options.split()
|
||||
result = CliRunner().invoke(
|
||||
app, ["project", "clone", target, *options, str(out)]
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert (out / "README.md").is_file()
|
||||
|
||||
|
||||
def test_project_push_pull(project_dir):
|
||||
proj = dict(SAMPLE_PROJECT)
|
||||
remote = "xyz"
|
||||
|
||||
with make_tempdir() as remote_dir:
|
||||
proj["remotes"] = {remote: str(remote_dir)}
|
||||
proj_text = srsly.yaml_dumps(proj)
|
||||
(project_dir / "project.yml").write_text(proj_text)
|
||||
|
||||
test_file = project_dir / "abc.txt"
|
||||
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert test_file.is_file()
|
||||
result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert not test_file.exists()
|
||||
result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert test_file.is_file()
|
||||
|
|
|
@ -10,8 +10,9 @@ from spacy.training import Example
|
|||
from spacy.lang.en import English
|
||||
from spacy.lang.de import German
|
||||
from spacy.util import registry, ignore_error, raise_error, find_matching_language
|
||||
from spacy.util import load_model_from_config
|
||||
import spacy
|
||||
from thinc.api import CupyOps, NumpyOps, get_current_ops
|
||||
from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
|
||||
|
||||
from .util import add_vecs_to_vocab, assert_docs_equal
|
||||
|
||||
|
@ -25,6 +26,51 @@ try:
|
|||
except ImportError:
|
||||
pass
|
||||
|
||||
TAGGER_CFG_STRING = """
|
||||
[nlp]
|
||||
lang = "en"
|
||||
pipeline = ["tok2vec","tagger"]
|
||||
|
||||
[components]
|
||||
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
nO = null
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[components.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v2"
|
||||
|
||||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
rows = [2000, 1000, 1000, 1000]
|
||||
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||
include_static_vectors = false
|
||||
|
||||
[components.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||
width = 96
|
||||
depth = 4
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
"""
|
||||
|
||||
|
||||
TAGGER_TRAIN_DATA = [
|
||||
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
|
||||
("Eat blue ham", {"tags": ["V", "J", "N"]}),
|
||||
]
|
||||
|
||||
|
||||
TAGGER_TRAIN_DATA = [
|
||||
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
|
||||
|
@ -52,7 +98,7 @@ def assert_sents_error(doc):
|
|||
|
||||
def warn_error(proc_name, proc, docs, e):
|
||||
logger = logging.getLogger("spacy")
|
||||
logger.warning(f"Trouble with component {proc_name}.")
|
||||
logger.warning("Trouble with component %s.", proc_name)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -91,6 +137,26 @@ def test_language_update(nlp):
|
|||
example = Example.from_dict(doc, wrongkeyannots)
|
||||
|
||||
|
||||
def test_language_update_updates():
|
||||
config = Config().from_str(TAGGER_CFG_STRING)
|
||||
nlp = load_model_from_config(config, auto_fill=True, validate=True)
|
||||
|
||||
train_examples = []
|
||||
for t in TAGGER_TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
|
||||
nlp.update(train_examples, sgd=optimizer)
|
||||
docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
|
||||
|
||||
xp = get_array_module(docs_after_update[0].tensor)
|
||||
assert xp.any(
|
||||
xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
|
||||
)
|
||||
|
||||
|
||||
def test_language_evaluate(nlp):
|
||||
text = "hello world"
|
||||
annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
|
||||
|
@ -664,11 +730,12 @@ def test_spacy_blank():
|
|||
("fra", "fr"),
|
||||
("fre", "fr"),
|
||||
("iw", "he"),
|
||||
("is", "isl"),
|
||||
("mo", "ro"),
|
||||
("mul", "xx"),
|
||||
("mul", "mul"),
|
||||
("no", "nb"),
|
||||
("pt-BR", "pt"),
|
||||
("xx", "xx"),
|
||||
("xx", "mul"),
|
||||
("zh-Hans", "zh"),
|
||||
("zh-Hant", None),
|
||||
("zxx", None),
|
||||
|
@ -689,11 +756,11 @@ def test_language_matching(lang, target):
|
|||
("fra", "fr"),
|
||||
("fre", "fr"),
|
||||
("iw", "he"),
|
||||
("is", "isl"),
|
||||
("mo", "ro"),
|
||||
("mul", "xx"),
|
||||
("xx", "mul"),
|
||||
("no", "nb"),
|
||||
("pt-BR", "pt"),
|
||||
("xx", "xx"),
|
||||
("zh-Hans", "zh"),
|
||||
],
|
||||
)
|
||||
|
|
|
@ -36,6 +36,7 @@ LANGUAGES = [
|
|||
"hu",
|
||||
pytest.param("id", marks=pytest.mark.slow()),
|
||||
pytest.param("it", marks=pytest.mark.slow()),
|
||||
pytest.param("isl", marks=pytest.mark.slow()),
|
||||
pytest.param("kn", marks=pytest.mark.slow()),
|
||||
pytest.param("lb", marks=pytest.mark.slow()),
|
||||
pytest.param("lt", marks=pytest.mark.slow()),
|
||||
|
|
|
@ -1,6 +1,7 @@
import numpy
import tempfile
import contextlib
import re
import srsly
from spacy.tokens import Doc
from spacy.vocab import Vocab

@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
    for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
        assert k1 == k2
        assert v1 == v2


def normalize_whitespace(s):
    return re.sub(r"\s+", " ", s)

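The new normalize_whitespace helper collapses any run of whitespace (including
newlines) into a single space, which is what lets the CLI help-text comparison
in the earlier test_cli_app hunk ignore wrapping differences. For example:

assert normalize_whitespace("spacy   benchmark\naccuracy") == "spacy benchmark accuracy"
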
@ -84,7 +84,7 @@ def test_issue1539():
@pytest.mark.issue(1807)
def test_issue1807():
    """Test vocab.set_vector also adds the word to the vocab."""
    vocab = Vocab(vectors_name="test_issue1807")
    vocab = Vocab()
    assert "hello" not in vocab
    vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
    assert "hello" in vocab

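This and the following hunks drop the vectors_name argument from the Vocab
constructor. A minimal sketch of the updated construction, mirroring the test
above (standalone and illustrative):

import numpy
from spacy.vocab import Vocab

vocab = Vocab()  # vectors_name is no longer passed here
vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
assert "hello" in vocab
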
@ -94,13 +94,12 @@ def test_issue1807():
|
|||
def test_issue2871():
|
||||
"""Test that vectors recover the correct key for spaCy reserved words."""
|
||||
words = ["dog", "cat", "SUFFIX"]
|
||||
vocab = Vocab(vectors_name="test_issue2871")
|
||||
vocab = Vocab()
|
||||
vocab.vectors.resize(shape=(3, 10))
|
||||
vector_data = numpy.zeros((3, 10), dtype="f")
|
||||
for word in words:
|
||||
_ = vocab[word] # noqa: F841
|
||||
vocab.set_vector(word, vector_data[0])
|
||||
vocab.vectors.name = "dummy_vectors"
|
||||
assert vocab["dog"].rank == 0
|
||||
assert vocab["cat"].rank == 1
|
||||
assert vocab["SUFFIX"].rank == 2
|
||||
|
@ -125,7 +124,7 @@ def test_issue4725_2():
|
|||
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
|
||||
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
|
||||
# or because of issues with pickling the NER (cf test_issue4725_1)
|
||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||
vocab = Vocab()
|
||||
data = numpy.ndarray((5, 3), dtype="f")
|
||||
data[0] = 1.0
|
||||
data[1] = 2.0
|
||||
|
@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
|
|||
|
||||
|
||||
def test_vocab_add_vector():
|
||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||
vocab = Vocab()
|
||||
data = OPS.xp.ndarray((5, 3), dtype="f")
|
||||
data[0] = 1.0
|
||||
data[1] = 2.0
|
||||
|
@ -356,7 +355,7 @@ def test_vocab_add_vector():
|
|||
|
||||
|
||||
def test_vocab_prune_vectors():
|
||||
vocab = Vocab(vectors_name="test_vocab_prune_vectors")
|
||||
vocab = Vocab()
|
||||
_ = vocab["cat"] # noqa: F841
|
||||
_ = vocab["dog"] # noqa: F841
|
||||
_ = vocab["kitten"] # noqa: F841
|
||||
|
@ -405,7 +404,7 @@ def test_vectors_serialize():
|
|||
|
||||
|
||||
def test_vector_is_oov():
|
||||
vocab = Vocab(vectors_name="test_vocab_is_oov")
|
||||
vocab = Vocab()
|
||||
data = OPS.xp.ndarray((5, 3), dtype="f")
|
||||
data[0] = 1.0
|
||||
data[1] = 2.0
|
||||
|
|
|
@ -105,9 +105,11 @@ class Doc:
|
|||
start_idx: int,
|
||||
end_idx: int,
|
||||
label: Union[int, str] = ...,
|
||||
*,
|
||||
kb_id: Union[int, str] = ...,
|
||||
vector: Optional[Floats1d] = ...,
|
||||
alignment_mode: str = ...,
|
||||
span_id: Union[int, str] = ...,
|
||||
) -> Span: ...
|
||||
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
|
||||
@property
|
||||
|
@ -126,12 +128,12 @@ class Doc:
|
|||
blocked: Optional[List[Span]] = ...,
|
||||
missing: Optional[List[Span]] = ...,
|
||||
outside: Optional[List[Span]] = ...,
|
||||
default: str = ...
|
||||
default: str = ...,
|
||||
) -> None: ...
|
||||
@property
|
||||
def noun_chunks(self) -> Iterator[Span]: ...
|
||||
def noun_chunks(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def sents(self) -> Iterator[Span]: ...
|
||||
def sents(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def lang(self) -> int: ...
|
||||
@property
|
||||
|
|
|
@ -520,7 +520,7 @@ cdef class Doc:
    def doc(self):
        return self

    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
    def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
        """Create a `Span` object from the slice
        `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
        created.

@ -528,9 +528,9 @@ cdef class Doc:
        doc (Doc): The parent document.
        start_idx (int): The index of the first character of the span.
        end_idx (int): The index of the first character after the span.
        label (uint64 or string): A label to attach to the Span, e.g. for
        label (Union[int, str]): A label to attach to the Span, e.g. for
            named entities.
        kb_id (uint64 or string): An ID from a KB to capture the meaning of a
        kb_id (Union[int, str]): An ID from a KB to capture the meaning of a
            named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.

@ -539,6 +539,7 @@ cdef class Doc:
            with token boundaries), "contract" (span of all tokens completely
            within the character span), "expand" (span of all tokens at least
            partially covered by the character span). Defaults to "strict".
        span_id (Union[int, str]): An identifier to associate with the span.
        RETURNS (Span): The newly constructed object.

        DOCS: https://spacy.io/api/doc#char_span

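With kb_id, vector, alignment_mode and span_id now keyword-only, callers have
to spell those argument names out. A hedged usage sketch of the updated
signature (the example text and IDs are illustrative):

import spacy

nlp = spacy.blank("en")
doc = nlp("Welcome to the Bank of China.")
# "Bank of China" covers characters 15-28; kb_id and span_id must be keywords now.
span = doc.char_span(15, 28, label="ORG", kb_id="Q790068", span_id="bank-1")
assert span.text == "Bank of China"
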
@ -656,9 +657,6 @@ cdef class Doc:
|
|||
elif self.vocab.vectors.size > 0:
|
||||
self._vector = sum(t.vector for t in self) / len(self)
|
||||
return self._vector
|
||||
elif self.tensor.size > 0:
|
||||
self._vector = self.tensor.mean(axis=0)
|
||||
return self._vector
|
||||
else:
|
||||
return xp.zeros((self.vocab.vectors_length,), dtype="float32")
|
||||
|
||||
|
@ -705,10 +703,10 @@ cdef class Doc:
|
|||
return self.text
|
||||
|
||||
property ents:
|
||||
"""The named entities in the document. Returns a tuple of named entity
|
||||
"""The named entities in the document. Returns a list of named entity
|
||||
`Span` objects, if the entity recognizer has been applied.
|
||||
|
||||
RETURNS (tuple): Entities in the document, one `Span` per entity.
|
||||
RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#ents
|
||||
"""
|
||||
|
@ -866,7 +864,7 @@ cdef class Doc:
            NP-level coordination, no prepositional phrases, and no relative
            clauses.

        YIELDS (Span): Noun chunks in the document.
        RETURNS (Tuple[Span]): Noun chunks in the document.

        DOCS: https://spacy.io/api/doc#noun_chunks
        """

@ -875,36 +873,35 @@ cdef class Doc:

        # Accumulate the result before beginning to iterate over it. This
        # prevents the tokenization from being changed out from under us
        # during the iteration. The tricky thing here is that Span accepts
        # its tokenization changing, so it's okay once we have the Span
        # objects. See Issue #375.
        # during the iteration.
        spans = []
        for start, end, label in self.noun_chunks_iterator(self):
            spans.append(Span(self, start, end, label=label))
        for span in spans:
            yield span
        return tuple(spans)

    @property
    def sents(self):
        """Iterate over the sentences in the document. Yields sentence `Span`
        objects. Sentence spans have no label.

        YIELDS (Span): Sentences in the document.
        RETURNS (Tuple[Span]): Sentences in the document.

        DOCS: https://spacy.io/api/doc#sents
        """
        if not self.has_annotation("SENT_START"):
            raise ValueError(Errors.E030)
        if "sents" in self.user_hooks:
            yield from self.user_hooks["sents"](self)
            return tuple(self.user_hooks["sents"](self))
        else:
            start = 0
            spans = []
            for i in range(1, self.length):
                if self.c[i].sent_start == 1:
                    yield Span(self, start, i)
                    spans.append(Span(self, start, i))
                    start = i
            if start != self.length:
                yield Span(self, start, self.length)
                spans.append(Span(self, start, self.length))
            return tuple(spans)

    @property
    def lang(self):

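As the hunks above show, Doc.noun_chunks and Doc.sents now build the full list
of spans and return it as a tuple instead of yielding lazily. A small sketch of
what that enables (assumes a sentencizer so sentence boundaries are set):

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("First sentence. Second sentence.")
sents = doc.sents            # now a tuple, not a generator
assert len(sents) == 2 and sents[0].text == "First sentence."
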
@ -1604,7 +1601,7 @@ cdef class Doc:
|
|||
for span_group in doc_json.get("spans", {}):
|
||||
spans = []
|
||||
for span in doc_json["spans"][span_group]:
|
||||
char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"])
|
||||
char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"])
|
||||
if char_span is None:
|
||||
raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"]))
|
||||
spans.append(char_span)
|
||||
|
|
|
@ -74,6 +74,8 @@ class Span:
|
|||
@property
|
||||
def ents(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def sents(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def has_vector(self) -> bool: ...
|
||||
@property
|
||||
def vector(self) -> Floats1d: ...
|
||||
|
@ -86,7 +88,7 @@ class Span:
|
|||
@property
|
||||
def text_with_ws(self) -> str: ...
|
||||
@property
|
||||
def noun_chunks(self) -> Iterator[Span]: ...
|
||||
def noun_chunks(self) -> Tuple[Span]: ...
|
||||
@property
|
||||
def root(self) -> Token: ...
|
||||
def char_span(
|
||||
|
@ -94,8 +96,11 @@ class Span:
|
|||
start_idx: int,
|
||||
end_idx: int,
|
||||
label: Union[int, str] = ...,
|
||||
*,
|
||||
kb_id: Union[int, str] = ...,
|
||||
vector: Optional[Floats1d] = ...,
|
||||
alignment_mode: str = ...,
|
||||
span_id: Union[int, str] = ...,
|
||||
) -> Span: ...
|
||||
@property
|
||||
def conjuncts(self) -> Tuple[Token]: ...
|
||||
|
|
|
@ -134,10 +134,8 @@ cdef class Span:
|
|||
else:
|
||||
return True
|
||||
|
||||
cdef SpanC* span_c = self.span_c()
|
||||
cdef SpanC* other_span_c = other.span_c()
|
||||
self_tuple = (span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, self.id, self.doc)
|
||||
other_tuple = (other_span_c.start_char, other_span_c.end_char, other_span_c.label, other_span_c.kb_id, other.id, other.doc)
|
||||
self_tuple = self._cmp_tuple()
|
||||
other_tuple = other._cmp_tuple()
|
||||
# <
|
||||
if op == 0:
|
||||
return self_tuple < other_tuple
|
||||
|
@ -158,8 +156,20 @@ cdef class Span:
|
|||
return self_tuple >= other_tuple
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self._cmp_tuple())
|
||||
|
||||
def _cmp_tuple(self):
|
||||
cdef SpanC* span_c = self.span_c()
|
||||
return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, span_c.id))
|
||||
return (
|
||||
span_c.start_char,
|
||||
span_c.end_char,
|
||||
span_c.start,
|
||||
span_c.end,
|
||||
span_c.label,
|
||||
span_c.kb_id,
|
||||
span_c.id,
|
||||
self.doc,
|
||||
)
|
||||
|
||||
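Comparison and hashing now both go through `_cmp_tuple`, so spans over the same doc with the same offsets, label and KB ID compare and hash consistently. A small sketch of the observable effect:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("New York is big")

# Two Span objects over the same tokens compare equal and collapse to a
# single member when used as set or dict keys.
a = doc[0:2]
b = doc[0:2]
print(a == b)       # True
print(len({a, b}))  # 1
```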
def __len__(self):
|
||||
"""Get the number of tokens in the span.
|
||||
|
@ -382,7 +392,7 @@ cdef class Span:
|
|||
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
|
||||
return result.item()
|
||||
|
||||
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
|
||||
|
@ -451,20 +461,21 @@ cdef class Span:
|
|||
"""Obtain the sentences that contain this span. If the given span
|
||||
crosses sentence boundaries, return all sentences it is a part of.
|
||||
|
||||
RETURNS (Iterable[Span]): All sentences that the span is a part of.
|
||||
RETURNS (Tuple[Span]): All sentences that the span is a part of.
|
||||
|
||||
DOCS: https://spacy.io/api/span#sents
|
||||
DOCS: https://spacy.io/api/span#sents
|
||||
"""
|
||||
cdef int start
|
||||
cdef int i
|
||||
|
||||
if "sents" in self.doc.user_span_hooks:
|
||||
yield from self.doc.user_span_hooks["sents"](self)
|
||||
elif "sents" in self.doc.user_hooks:
|
||||
return tuple(self.doc.user_span_hooks["sents"](self))
|
||||
spans = []
|
||||
if "sents" in self.doc.user_hooks:
|
||||
for sentence in self.doc.user_hooks["sents"](self.doc):
|
||||
if sentence.end > self.start:
|
||||
if sentence.start < self.end or sentence.start == self.start == self.end:
|
||||
yield sentence
|
||||
spans.append(sentence)
|
||||
else:
|
||||
break
|
||||
else:
|
||||
|
@ -479,12 +490,13 @@ cdef class Span:
|
|||
# Now, find all the sentences in the span
|
||||
for i in range(start + 1, self.doc.length):
|
||||
if self.doc.c[i].sent_start == 1:
|
||||
yield Span(self.doc, start, i)
|
||||
spans.append(Span(self.doc, start, i))
|
||||
start = i
|
||||
if start >= self.end:
|
||||
break
|
||||
if start < self.end:
|
||||
yield Span(self.doc, start, self.end)
|
||||
spans.append(Span(self.doc, start, self.end))
|
||||
return tuple(spans)
|
||||
|
||||
|
||||
@property
|
||||
|
@ -492,7 +504,7 @@ cdef class Span:
|
|||
"""The named entities that fall completely within the span. Returns
|
||||
a tuple of `Span` objects.
|
||||
|
||||
RETURNS (tuple): Entities in the span, one `Span` per entity.
|
||||
RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.
|
||||
|
||||
DOCS: https://spacy.io/api/span#ents
|
||||
"""
|
||||
|
@ -507,7 +519,7 @@ cdef class Span:
|
|||
ents.append(ent)
|
||||
else:
|
||||
break
|
||||
return ents
|
||||
return tuple(ents)
|
||||
|
||||
@property
|
||||
def has_vector(self):
|
||||
|
@ -522,8 +534,6 @@ cdef class Span:
|
|||
return self.doc.user_span_hooks["has_vector"](self)
|
||||
elif self.vocab.vectors.size > 0:
|
||||
return any(token.has_vector for token in self)
|
||||
elif self.doc.tensor.size > 0:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
@ -605,13 +615,15 @@ cdef class Span:
|
|||
NP-level coordination, no prepositional phrases, and no relative
|
||||
clauses.
|
||||
|
||||
YIELDS (Span): Noun chunks in the span.
|
||||
RETURNS (Tuple[Span]): Noun chunks in the span.
|
||||
|
||||
DOCS: https://spacy.io/api/span#noun_chunks
|
||||
"""
|
||||
spans = []
|
||||
for span in self.doc.noun_chunks:
|
||||
if span.start >= self.start and span.end <= self.end:
|
||||
yield span
|
||||
spans.append(span)
|
||||
return tuple(spans)
|
||||
|
||||
@property
|
||||
def root(self):
|
||||
|
@ -656,22 +668,28 @@ cdef class Span:
|
|||
else:
|
||||
return self.doc[root]
|
||||
|
||||
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
|
||||
def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0):
|
||||
"""Create a `Span` object from the slice `span.text[start : end]`.
|
||||
|
||||
start (int): The index of the first character of the span.
|
||||
end (int): The index of the first character after the span.
|
||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
||||
start_idx (int): The index of the first character of the span.
|
||||
end_idx (int): The index of the first character after the span.
|
||||
label (Union[int, str]): A label to attach to the Span, e.g. for
|
||||
named entities.
|
||||
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
|
||||
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||
the span.
|
||||
alignment_mode (str): How character indices are aligned to token
|
||||
boundaries. Options: "strict" (character indices must be aligned
|
||||
with token boundaries), "contract" (span of all tokens completely
|
||||
within the character span), "expand" (span of all tokens at least
|
||||
partially covered by the character span). Defaults to "strict".
|
||||
span_id (Union[int, str]): An identifier to associate with the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
cdef SpanC* span_c = self.span_c()
|
||||
start_idx += span_c.start_char
|
||||
end_idx += span_c.start_char
|
||||
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
|
||||
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
|
||||
|
||||
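As the docstring notes, the offsets passed to `Span.char_span` are relative to the span's own text and are shifted by `span_c.start_char` before delegating to `Doc.char_span`. A quick sketch (the keyword-only `alignment_mode` and `span_id` arguments assume a version matching the signature above):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("The quick brown fox jumped")
span = doc[1:5]  # "quick brown fox jumped"

# Characters 0..11 of the span's text, i.e. "quick brown"; None is returned
# if the characters don't line up with token boundaries in "strict" mode.
sub = span.char_span(0, 11)
print(sub.text if sub is not None else None)
```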
@property
|
||||
def conjuncts(self):
|
||||
|
|
|
@ -389,8 +389,6 @@ cdef class Token:
|
|||
"""
|
||||
if "has_vector" in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks["has_vector"](self)
|
||||
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
|
||||
return True
|
||||
return self.vocab.has_vector(self.c.lex.orth)
|
||||
|
||||
@property
|
||||
|
@ -404,8 +402,6 @@ cdef class Token:
|
|||
"""
|
||||
if "vector" in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks["vector"](self)
|
||||
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
|
||||
return self.doc.tensor[self.i]
|
||||
else:
|
||||
return self.vocab.get_vector(self.c.lex.orth)
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ def create_copy_from_base_model(
|
|||
) -> Callable[[Language], Language]:
|
||||
def copy_from_base_model(nlp):
|
||||
if tokenizer:
|
||||
logger.info(f"Copying tokenizer from: {tokenizer}")
|
||||
logger.info("Copying tokenizer from: %s", tokenizer)
|
||||
base_nlp = load_model(tokenizer)
|
||||
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
|
||||
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
|
||||
|
@ -23,7 +23,7 @@ def create_copy_from_base_model(
|
|||
)
|
||||
)
|
||||
if vocab:
|
||||
logger.info(f"Copying vocab from: {vocab}")
|
||||
logger.info("Copying vocab from: %s", vocab)
|
||||
# only reload if the vocab is from a different model
|
||||
if tokenizer != vocab:
|
||||
base_nlp = load_model(vocab)
|
||||
|
|
|
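These hunks swap f-strings for `%s`-style templates so the string is only formatted when the log record is actually emitted. The same pattern in isolation (logger name and value are placeholders):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("spacy")

tokenizer_source = "en_core_web_sm"  # placeholder value
# The template is only interpolated if INFO-level logging is enabled.
logger.info("Copying tokenizer from: %s", tokenizer_source)
```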
@ -86,7 +86,7 @@ def conll_ner_to_docs(
|
|||
if model:
|
||||
nlp = load_model(model)
|
||||
else:
|
||||
nlp = get_lang_class("xx")()
|
||||
nlp = get_lang_class("mul")()
|
||||
for conll_doc in input_data.strip().split(doc_delimiter):
|
||||
conll_doc = conll_doc.strip()
|
||||
if not conll_doc:
|
||||
|
@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
|
|||
"Segmenting sentences with sentencizer. (Use `-b model` for "
|
||||
"improved parser-based sentence segmentation.)"
|
||||
)
|
||||
nlp = get_lang_class("xx")()
|
||||
nlp = get_lang_class("mul")()
|
||||
sentencizer = nlp.create_pipe("sentencizer")
|
||||
lines = doc.strip().split("\n")
|
||||
words = [line.strip().split()[0] for line in lines]
|
||||
|
|
|
@ -3,7 +3,7 @@ from ..gold_io import json_iterate, json_to_annotations
|
|||
from ..example import annotations_to_doc
|
||||
from ..example import _fix_legacy_dict_data, _parse_example_dict_data
|
||||
from ...util import load_model
|
||||
from ...lang.xx import MultiLanguage
|
||||
from ...lang.mul import MultiLanguage
|
||||
|
||||
|
||||
def json_to_docs(input_data, model=None, **kwargs):
|
||||
|
|
|
@ -29,7 +29,7 @@ def create_docbin_reader(
|
|||
) -> Callable[["Language"], Iterable[Example]]:
|
||||
if path is None:
|
||||
raise ValueError(Errors.E913)
|
||||
util.logger.debug(f"Loading corpus from path: {path}")
|
||||
util.logger.debug("Loading corpus from path: %s", path)
|
||||
return Corpus(
|
||||
path,
|
||||
gold_preproc=gold_preproc,
|
||||
|
|
|
@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
|||
frozen_components = T["frozen_components"]
|
||||
# Sourced components that require resume_training
|
||||
resume_components = [p for p in sourced if p not in frozen_components]
|
||||
logger.info(f"Pipeline: {nlp.pipe_names}")
|
||||
logger.info("Pipeline: %s", nlp.pipe_names)
|
||||
if resume_components:
|
||||
with nlp.select_pipes(enable=resume_components):
|
||||
logger.info(f"Resuming training for: {resume_components}")
|
||||
logger.info("Resuming training for: %s", resume_components)
|
||||
nlp.resume_training(sgd=optimizer)
|
||||
# Make sure that listeners are defined before initializing further
|
||||
nlp._link_components()
|
||||
|
@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
|||
if T["max_epochs"] == -1:
|
||||
sample_size = 100
|
||||
logger.debug(
|
||||
f"Due to streamed train corpus, using only first {sample_size} "
|
||||
f"examples for initialization. If necessary, provide all labels "
|
||||
f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
|
||||
"Due to streamed train corpus, using only first %s examples for initialization. "
|
||||
"If necessary, provide all labels in [initialize]. "
|
||||
"More info: https://spacy.io/api/cli#init_labels",
|
||||
sample_size,
|
||||
)
|
||||
nlp.initialize(
|
||||
lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
|
||||
)
|
||||
else:
|
||||
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
|
||||
logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
|
||||
logger.info("Initialized pipeline components: %s", nlp.pipe_names)
|
||||
# Detect components with listeners that are not frozen consistently
|
||||
for name, proc in nlp.pipeline:
|
||||
for listener in getattr(
|
||||
|
@ -109,7 +110,7 @@ def init_vocab(
|
|||
) -> None:
|
||||
if lookups:
|
||||
nlp.vocab.lookups = lookups
|
||||
logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
|
||||
logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
|
||||
data_path = ensure_path(data)
|
||||
if data_path is not None:
|
||||
lex_attrs = srsly.read_jsonl(data_path)
|
||||
|
@ -125,11 +126,11 @@ def init_vocab(
|
|||
else:
|
||||
oov_prob = DEFAULT_OOV_PROB
|
||||
nlp.vocab.cfg.update({"oov_prob": oov_prob})
|
||||
logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
|
||||
logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
|
||||
logger.info("Created vocabulary")
|
||||
if vectors is not None:
|
||||
load_vectors_into_model(nlp, vectors)
|
||||
logger.info(f"Added vectors: {vectors}")
|
||||
logger.info("Added vectors: %s", vectors)
|
||||
# warn if source model vectors are not identical
|
||||
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
|
||||
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
||||
|
@ -191,7 +192,7 @@ def init_tok2vec(
|
|||
if weights_data is not None:
|
||||
layer = get_tok2vec_ref(nlp, P)
|
||||
layer.from_bytes(weights_data)
|
||||
logger.info(f"Loaded pretrained weights from {init_tok2vec}")
|
||||
logger.info("Loaded pretrained weights from %s", init_tok2vec)
|
||||
return True
|
||||
return False
|
||||
|
||||
|
@ -202,7 +203,6 @@ def convert_vectors(
|
|||
*,
|
||||
truncate: int,
|
||||
prune: int,
|
||||
name: Optional[str] = None,
|
||||
mode: str = VectorsMode.default,
|
||||
) -> None:
|
||||
vectors_loc = ensure_path(vectors_loc)
|
||||
|
@ -216,13 +216,13 @@ def convert_vectors(
|
|||
nlp.vocab.deduplicate_vectors()
|
||||
else:
|
||||
if vectors_loc:
|
||||
logger.info(f"Reading vectors from {vectors_loc}")
|
||||
logger.info("Reading vectors from %s", vectors_loc)
|
||||
vectors_data, vector_keys, floret_settings = read_vectors(
|
||||
vectors_loc,
|
||||
truncate,
|
||||
mode=mode,
|
||||
)
|
||||
logger.info(f"Loaded vectors from {vectors_loc}")
|
||||
logger.info("Loaded vectors from %s", vectors_loc)
|
||||
else:
|
||||
vectors_data, vector_keys = (None, None)
|
||||
if vector_keys is not None and mode != VectorsMode.floret:
|
||||
|
@ -241,12 +241,6 @@ def convert_vectors(
|
|||
strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
|
||||
)
|
||||
nlp.vocab.deduplicate_vectors()
|
||||
if name is None:
|
||||
# TODO: Is this correct? Does this matter?
|
||||
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
|
||||
else:
|
||||
nlp.vocab.vectors.name = name
|
||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||
if prune >= 1 and mode != VectorsMode.floret:
|
||||
nlp.vocab.prune_vectors(prune)
|
||||
|
||||
|
|
|
@ -210,7 +210,7 @@ def train_while_improving(
|
|||
subbatch,
|
||||
drop=dropout,
|
||||
losses=losses,
|
||||
sgd=False, # type: ignore[arg-type]
|
||||
sgd=None,
|
||||
exclude=exclude,
|
||||
annotates=annotating_components,
|
||||
)
|
||||
|
@ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
|
|||
if subdir.exists():
|
||||
try:
|
||||
shutil.rmtree(str(subdir))
|
||||
logger.debug(f"Removed existing output directory: {subdir}")
|
||||
logger.debug("Removed existing output directory: %s", subdir)
|
||||
except Exception as e:
|
||||
raise IOError(Errors.E901.format(path=path)) from e
|
||||
|
|
|
@ -33,6 +33,7 @@ import inspect
|
|||
import pkgutil
|
||||
import logging
|
||||
import socket
|
||||
import stat
|
||||
|
||||
try:
|
||||
import cupy.random
|
||||
|
@ -55,7 +56,7 @@ if TYPE_CHECKING:
|
|||
# fmt: off
|
||||
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
||||
DEFAULT_OOV_PROB = -20
|
||||
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
|
||||
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
|
||||
|
||||
# Default order of sections in the config file. Not all sections needs to exist,
|
||||
# and additional sections are added at the end, in alphabetical order.
|
||||
|
@ -139,8 +140,17 @@ class registry(thinc.registry):
|
|||
return func
|
||||
|
||||
@classmethod
|
||||
def find(cls, registry_name: str, func_name: str) -> Callable:
|
||||
"""Get info about a registered function from the registry."""
|
||||
def find(
|
||||
cls, registry_name: str, func_name: str
|
||||
) -> Dict[str, Optional[Union[str, int]]]:
|
||||
"""Find information about a registered function, including the
|
||||
module and path to the file it's defined in, the line number and the
|
||||
docstring, if available.
|
||||
|
||||
registry_name (str): Name of the catalogue registry.
|
||||
func_name (str): Name of the registered function.
|
||||
RETURNS (Dict[str, Optional[Union[str, int]]]): The function info.
|
||||
"""
|
||||
# We're overwriting this classmethod so we're able to provide more
|
||||
# specific error messages and implement a fallback to spacy-legacy.
|
||||
if not hasattr(cls, registry_name):
|
||||
|
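Per the updated annotation, `registry.find` returns an info dict (module, file, line number, docstring) rather than the callable itself. A hedged sketch using a registered architecture name:

```python
import spacy

# Keys follow the docstring above; .get() guards against missing fields.
info = spacy.registry.find("architectures", "spacy.Tok2Vec.v2")
print(info.get("module"), info.get("line_no"))
```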
@ -283,7 +293,7 @@ def find_matching_language(lang: str) -> Optional[str]:
|
|||
import spacy.lang # noqa: F401
|
||||
|
||||
if lang == "xx":
|
||||
return "xx"
|
||||
return "mul"
|
||||
|
||||
# Find out which language modules we have
|
||||
possible_languages = []
|
||||
|
@ -301,11 +311,7 @@ def find_matching_language(lang: str) -> Optional[str]:
|
|||
# is labeled that way is probably trying to be distinct from 'zh' and
|
||||
# shouldn't automatically match.
|
||||
match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9)
|
||||
if match == "mul":
|
||||
# Convert 'mul' back to spaCy's 'xx'
|
||||
return "xx"
|
||||
else:
|
||||
return match
|
||||
return match
|
||||
|
||||
|
||||
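With the `xx` to `mul` rename reflected here, the multi-language class is looked up under the IETF code `mul`. A defensive sketch that also runs on releases from before the rename:

```python
from spacy.util import get_lang_class

try:
    MultiLanguage = get_lang_class("mul")
except ImportError:
    # Earlier releases register the multi-language class under "xx".
    MultiLanguage = get_lang_class("xx")

nlp = MultiLanguage()
print(nlp.lang)
```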
def get_lang_class(lang: str) -> Type["Language"]:
|
||||
|
@ -1034,8 +1040,15 @@ def make_tempdir() -> Generator[Path, None, None]:
|
|||
"""
|
||||
d = Path(tempfile.mkdtemp())
|
||||
yield d
|
||||
|
||||
# On Windows, git clones use read-only files, which cause permission errors
|
||||
# when being deleted. This forcibly fixes permissions.
|
||||
def force_remove(rmfunc, path, ex):
|
||||
os.chmod(path, stat.S_IWRITE)
|
||||
rmfunc(path)
|
||||
|
||||
try:
|
||||
shutil.rmtree(str(d))
|
||||
shutil.rmtree(str(d), onerror=force_remove)
|
||||
except PermissionError as e:
|
||||
warnings.warn(Warnings.W091.format(dir=d, msg=e))
|
||||
|
||||
|
|
|
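The `onerror` hook added to `make_tempdir` clears the read-only bit and retries the delete, which is what trips up `rmtree` on Windows git checkouts. The same pattern in isolation, using only the standard library:

```python
import os
import shutil
import stat
import tempfile
from pathlib import Path

def force_remove(rmfunc, path, ex):
    # Make the path writable again, then retry the operation that failed.
    os.chmod(path, stat.S_IWRITE)
    rmfunc(path)

d = Path(tempfile.mkdtemp())
target = d / "readonly.txt"
target.write_text("demo")
os.chmod(target, stat.S_IREAD)

shutil.rmtree(str(d), onerror=force_remove)
print(d.exists())  # False
```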
@ -52,7 +52,6 @@ cdef class Vectors:
|
|||
DOCS: https://spacy.io/api/vectors
|
||||
"""
|
||||
cdef public object strings
|
||||
cdef public object name
|
||||
cdef readonly object mode
|
||||
cdef public object data
|
||||
cdef public object key2row
|
||||
|
@ -64,14 +63,13 @@ cdef class Vectors:
|
|||
cdef readonly unicode bow
|
||||
cdef readonly unicode eow
|
||||
|
||||
def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
|
||||
def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
|
||||
"""Create a new vector store.
|
||||
|
||||
strings (StringStore): The string store.
|
||||
shape (tuple): Size of the table, as (# entries, # columns)
|
||||
data (numpy.ndarray or cupy.ndarray): The vector data.
|
||||
keys (iterable): A sequence of keys, aligned with the data.
|
||||
name (str): A name to identify the vectors table.
|
||||
mode (str): Vectors mode: "default" or "floret" (default: "default").
|
||||
minn (int): The floret char ngram minn (default: 0).
|
||||
maxn (int): The floret char ngram maxn (default: 0).
|
||||
|
@ -85,7 +83,6 @@ cdef class Vectors:
|
|||
self.strings = strings
|
||||
if self.strings is None:
|
||||
self.strings = StringStore()
|
||||
self.name = name
|
||||
if mode not in Mode.values():
|
||||
raise ValueError(
|
||||
Errors.E202.format(
|
||||
|
|
|
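With the `name` argument dropped from `Vectors.__init__`, a table is built purely from strings, data and keys. A brief sketch with toy data (values are illustrative only):

```python
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

data = numpy.zeros((3, 5), dtype="f")
keys = ["cat", "dog", "fish"]
vectors = Vectors(strings=StringStore(), data=data, keys=keys)
print(vectors.shape)  # (3, 5)
print(len(vectors))   # 3 rows
```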
@ -11,7 +11,8 @@ from .vectors import Vectors
|
|||
from pathlib import Path
|
||||
|
||||
def create_vocab(
|
||||
lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ...
|
||||
lang: Optional[str],
|
||||
defaults: Any,
|
||||
) -> Vocab: ...
|
||||
|
||||
class Vocab:
|
||||
|
@ -28,7 +29,6 @@ class Vocab:
|
|||
strings: Optional[Union[List[str], StringStore]] = ...,
|
||||
lookups: Optional[Lookups] = ...,
|
||||
oov_prob: float = ...,
|
||||
vectors_name: Optional[str] = ...,
|
||||
writing_system: Dict[str, Any] = ...,
|
||||
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
|
||||
) -> None: ...
|
||||
|
|
|
@ -23,7 +23,7 @@ from .lang.norm_exceptions import BASE_NORMS
|
|||
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
|
||||
|
||||
|
||||
def create_vocab(lang, defaults, vectors_name=None):
|
||||
def create_vocab(lang, defaults):
|
||||
# If the spacy-lookups-data package is installed, we pre-populate the lookups
|
||||
# with lexeme data, if available
|
||||
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
|
||||
|
@ -39,7 +39,6 @@ def create_vocab(lang, defaults, vectors_name=None):
|
|||
lex_attr_getters=lex_attrs,
|
||||
writing_system=defaults.writing_system,
|
||||
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
|
||||
vectors_name=vectors_name,
|
||||
)
|
||||
|
||||
|
||||
|
@ -51,8 +50,8 @@ cdef class Vocab:
|
|||
DOCS: https://spacy.io/api/vocab
|
||||
"""
|
||||
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
|
||||
oov_prob=-20., vectors_name=None, writing_system={},
|
||||
get_noun_chunks=None, **deprecated_kwargs):
|
||||
oov_prob=-20., writing_system={}, get_noun_chunks=None,
|
||||
**deprecated_kwargs):
|
||||
"""Create the vocabulary.
|
||||
|
||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to
|
||||
|
@ -61,7 +60,6 @@ cdef class Vocab:
|
|||
vice versa.
|
||||
lookups (Lookups): Container for large lookup tables and dictionaries.
|
||||
oov_prob (float): Default OOV probability.
|
||||
vectors_name (str): Optional name to identify the vectors table.
|
||||
get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]):
|
||||
A function that yields base noun phrases used for Doc.noun_chunks.
|
||||
"""
|
||||
|
@ -78,7 +76,7 @@ cdef class Vocab:
|
|||
_ = self[string]
|
||||
self.lex_attr_getters = lex_attr_getters
|
||||
self.morphology = Morphology(self.strings)
|
||||
self.vectors = Vectors(strings=self.strings, name=vectors_name)
|
||||
self.vectors = Vectors(strings=self.strings)
|
||||
self.lookups = lookups
|
||||
self.writing_system = writing_system
|
||||
self.get_noun_chunks = get_noun_chunks
|
||||
|
@ -308,7 +306,7 @@ cdef class Vocab:
|
|||
for key, row in self.vectors.key2row.items()
|
||||
}
|
||||
# replace vectors with deduplicated version
|
||||
self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name)
|
||||
self.vectors = Vectors(strings=self.strings, data=data)
|
||||
for key, row in key2row.items():
|
||||
self.vectors.add(key, row=row)
|
||||
|
||||
|
@ -358,7 +356,7 @@ cdef class Vocab:
|
|||
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
|
||||
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
|
||||
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
|
||||
self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name)
|
||||
self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row])
|
||||
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
|
||||
syn_keys = ops.to_numpy(syn_keys)
|
||||
remap = {}
|
||||
|
|
|
@ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
|
|||
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||
|
||||
### spacy.EmptyKB.v1 {id="EmptyKB"}
|
||||
### spacy.EmptyKB.v1 {id="EmptyKB.v1"}
|
||||
|
||||
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
|
||||
instance. This is the default when a new entity linker component is created.
|
||||
instance.
|
||||
|
||||
| Name | Description |
|
||||
| ---------------------- | ----------------------------------------------------------------------------------- |
|
||||
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
|
||||
|
||||
### spacy.EmptyKB.v2 {id="EmptyKB"}
|
||||
|
||||
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
|
||||
instance. This is the default when a new entity linker component is created. It
|
||||
returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
|
||||
|
||||
### spacy.KBFromFile.v1 {id="KBFromFile"}
|
||||
|
||||
A function that reads an existing `KnowledgeBase` from file.
|
||||
|
@ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
|
|||
`CandidateGenerator` uses the text of a mention to find its potential aliases in
|
||||
the `KnowledgeBase`. Note that this function is case-dependent.
|
||||
|
||||
### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}
|
||||
|
||||
A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
|
||||
[`Span`](/api/span) objects denoting named entities, and returns a list of
|
||||
plausible [`Candidate`](/api/kb/#candidate) objects per specified
|
||||
[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
|
||||
mention to find its potential aliases in the `KnowledgeBase`. Note that this
|
||||
function is case-dependent.
|
||||
|
||||
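Both registered functions above ultimately produce an [`InMemoryLookupKB`](/api/inmemorylookupkb). A hedged sketch of creating and populating one directly (the entity ID, frequency and alias values are made up):

```python
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=64)
kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
print(kb.get_size_entities(), kb.get_size_aliases())
```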
## Coreference {id="coref-architectures",tag="experimental"}
|
||||
|
||||
A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to
|
||||
|
|
|
@ -201,7 +201,7 @@ This functionality was previously available as part of the command `init-model`.
|
|||
</Infobox>
|
||||
|
||||
```bash
|
||||
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose]
|
||||
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
|
@ -212,7 +212,6 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
|
|||
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
|
||||
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
|
||||
| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
|
||||
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
|
||||
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |
|
||||
|
@ -1410,12 +1409,13 @@ $ python -m spacy project assets [project_dir]
|
|||
> $ python -m spacy project assets [--sparse]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
||||
| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
|
||||
| Name | Description |
|
||||
| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
|
||||
| `--extra`, `-e` <Tag variant="new">3.3.1</Tag> | Download assets marked as "extra". Default false. ~~bool (flag)~~ |
|
||||
| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v2.22+. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | Downloaded or copied assets defined in the `project.yml`. |
|
||||
|
||||
### project run {id="project-run",tag="command"}
|
||||
|
||||
|
@ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir]
|
|||
### project pull {id="project-pull",tag="command"}
|
||||
|
||||
Download all files or directories listed as `outputs` for commands, unless they
|
||||
are not already present locally. When searching for files in the remote, `pull`
|
||||
are already present locally. When searching for files in the remote, `pull`
|
||||
won't just look at the output path, but will also consider the **command
|
||||
string** and the **hashes of the dependencies**. For instance, let's say you've
|
||||
previously pushed a checkpoint to the remote, but now you've changed some
|
||||
|
|
|
@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
|
|||
come directly from
|
||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||
|
||||
| Symbol | Description |
|
||||
| --------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| Symbol | Description |
|
||||
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
|
||||
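The immediate left/right child and parent operators marked 3.5.1 above anchor on adjacent token indices. A small sketch using `>+` (assumes a pipeline with a parser, here `en_core_web_sm` if installed):

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    # ">+": an immediate right child, i.e. a child at exactly verb.i + 1
    {"LEFT_ID": "verb", "REL_OP": ">+", "RIGHT_ID": "next_child", "RIGHT_ATTRS": {}},
]
matcher.add("VERB_NEXT_CHILD", [pattern])

doc = nlp("I like cats")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # e.g. ['like', 'cats']
```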
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
|||
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
|
||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||
| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
|
||||
| `tags` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `pos` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `morphs` <Tag variant="new">3</Tag> | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
|
@ -209,15 +209,17 @@ alignment mode `"strict".
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the first character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the first character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
|
||||
## Doc.set_ents {id="set_ents",tag="method",version="3"}
|
||||
|
||||
|
@ -652,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).
|
|||
|
||||
## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}
|
||||
|
||||
Iterate over the base noun phrases in the document. Yields base noun-phrase
|
||||
`Span` objects, if the document has been syntactically parsed. A base noun
|
||||
phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
|
||||
nested within it – so no NP-level coordination, no prepositional phrases, and no
|
||||
relative clauses.
|
||||
Returns a tuple of the base noun phrases in the doc, if the document has been
|
||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
|
||||
does not permit other NPs to be nested within it – so no NP-level coordination,
|
||||
no prepositional phrases, and no relative clauses.
|
||||
|
||||
To customize the noun chunk iterator in a loaded pipeline, modify
|
||||
[`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
|
||||
|
@ -673,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised.
|
|||
> assert chunks[1].text == "another phrase"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------- |
|
||||
| **YIELDS** | Noun chunks in the document. ~~Span~~ |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------- |
|
||||
| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |
|
||||
|
||||
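The `nlp.vocab.get_noun_chunks` hook mentioned above is the place to swap in a custom iterator. A hedged sketch with a deliberately trivial replacement that treats every token as its own chunk, just to show the plumbing:

```python
import spacy

nlp = spacy.load("en_core_web_sm")

def every_token_is_a_chunk(doclike):
    # The syntax-iterator protocol: yield (start, end, label) triples of
    # token indices; the label must be a string-store hash.
    label = doclike.vocab.strings.add("NP")
    for token in doclike:
        yield token.i, token.i + 1, label

nlp.vocab.get_noun_chunks = every_token_is_a_chunk
doc = nlp("Autonomous cars shift insurance liability")
print([chunk.text for chunk in doc.noun_chunks])
```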
## Doc.sents {id="sents",tag="property",model="sentences"}
|
||||
|
||||
Iterate over the sentences in the document. Sentence spans have no label.
|
||||
Returns a tuple of the sentences in the document. Sentence spans have no label.
|
||||
|
||||
This property is only available when
|
||||
[sentence boundaries](/usage/linguistic-features#sbd) have been set on the
|
||||
|
@ -695,9 +696,9 @@ will raise an error otherwise.
|
|||
> assert [s.root.text for s in sents] == ["is", "'s"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | ----------------------------------- |
|
||||
| **YIELDS** | Sentences in the document. ~~Span~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------ |
|
||||
| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |
|
||||
|
||||
## Doc.has_vector {id="has_vector",tag="property",model="vectors"}
|
||||
|
||||
|
|
|
@ -53,20 +53,22 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("entity_linker", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| Setting | Description |
|
||||
| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
|
||||
| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
|
||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
||||
|
|
|
@ -45,7 +45,7 @@ architectures and their arguments and hyperparameters.
|
|||
| Setting | Description |
|
||||
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ |
|
||||
| `extend` <Tag variant="new">3.2</Tag> | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
|
||||
| `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ |
|
||||
|
|
|
@ -30,7 +30,7 @@ Create a new `Scorer`.
|
|||
| Name | Description |
|
||||
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ |
|
||||
| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ |
|
||||
| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `mul`. ~~str~~ |
|
||||
| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
|
||||
|
|
|
@ -186,14 +186,17 @@ the character indices don't map to a valid span.
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the first character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
| Name | Description |
|
||||
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start_idx` | The index of the first character of the span. ~~int~~ |
|
||||
| `end_idx` | The index of the last character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
|
||||
## Span.similarity {id="similarity",tag="method",model="vectors"}
|
||||
|
||||
|
@ -272,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of
|
|||
> assert ents[0].text == "Mr. Best"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------- |
|
||||
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------ |
|
||||
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |
|
||||
|
||||
## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}
|
||||
|
||||
Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
|
||||
objects, if the document has been syntactically parsed. A base noun phrase, or
|
||||
"NP chunk", is a noun phrase that does not permit other NPs to be nested within
|
||||
it – so no NP-level coordination, no prepositional phrases, and no relative
|
||||
clauses.
|
||||
Returns a tuple of the base noun phrases in the span if the document has been
|
||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
|
||||
does not permit other NPs to be nested within it – so no NP-level coordination,
|
||||
no prepositional phrases, and no relative clauses.
|
||||
|
||||
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
|
||||
has not been implemented for the given language, a `NotImplementedError` is
|
||||
|
@ -298,9 +300,9 @@ raised.
|
|||
> assert chunks[0].text == "another phrase"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | --------------------------------- |
|
||||
| **YIELDS** | Noun chunks in the span. ~~Span~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------- |
|
||||
| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
|
||||
|
||||
## Span.as_doc {id="as_doc",tag="method"}
|
||||
|
||||
|
@ -522,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]
|
|||
|
||||
## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}
|
||||
|
||||
Returns a generator over the sentences the span belongs to. This property is
|
||||
only available when [sentence boundaries](/usage/linguistic-features#sbd) have
|
||||
been set on the document by the `parser`, `senter`, `sentencizer` or some custom
|
||||
Returns a tuple of the sentences the span belongs to. This property is only
|
||||
available when [sentence boundaries](/usage/linguistic-features#sbd) have been
|
||||
set on the document by the `parser`, `senter`, `sentencizer` or some custom
|
||||
function. It will raise an error otherwise.
|
||||
|
||||
If the span happens to cross sentence boundaries, all sentences the span
|
||||
|
@ -538,9 +540,9 @@ overlaps with will be returned.
|
|||
> assert len(span.sents) == 2
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------- |
|
||||
| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------- |
|
||||
| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |
|
||||
|
||||
## Attributes {id="attributes"}
|
||||
|
||||
|
|
|
@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used.

> displacy.serve(doc, style="dep", options=options)
> ```

| Name | Description |
| ------------------ | ----------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |

| Name | Description |
| ------------------ | ----------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
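
For illustration, the settings above can be passed to `displacy.render` or `displacy.serve` like this (a minimal sketch that assumes the `en_core_web_sm` pipeline is installed; the option values are arbitrary examples):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # assumes this pipeline has been downloaded
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

# Example option values only; any of the settings in the table above can be set here.
options = {
    "compact": True,
    "add_lemma": True,
    "color": "hsl(120, 100%, 50%)",  # any CSS-legal color string
    "bg": "#09a3d5",
    "distance": 120,
}
html = displacy.render(doc, style="dep", options=options)
```

`displacy.serve(doc, style="dep", options=options)` accepts the same dictionary if you prefer the built-in web server.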
#### Named Entity Visualizer options {id="displacy_options-ent"}
@ -52,7 +52,6 @@ modified later.

| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
| `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ |
| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
| `name` | A name to identify the vectors table. ~~str~~ |
| `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ |
| `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ |
| `maxn` <Tag variant="new">3.2</Tag> | The floret char ngram maxn (default: `0`). ~~int~~ |
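
For reference, a minimal sketch of constructing a standalone vectors table with these arguments (the data, keys and table name are made-up placeholders):

```python
import numpy
from spacy.vectors import Vectors

# Three 4-dimensional vectors keyed by strings (placeholder data)
data = numpy.asarray(
    [[0.1, 0.2, 0.3, 0.4],
     [0.5, 0.6, 0.7, 0.8],
     [0.9, 1.0, 1.1, 1.2]],
    dtype="f",
)
keys = ["cat", "dog", "rat"]
vectors = Vectors(data=data, keys=keys, name="demo_vectors")
print(vectors.shape)  # (3, 4)
```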
@ -27,7 +27,6 @@ Create the vocabulary.

| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
| `vectors_name` | A name to identify the vectors table. ~~str~~ |
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
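
For illustration, a minimal sketch of creating a vocabulary directly with one of these arguments (the seed strings are arbitrary):

```python
from spacy.vocab import Vocab

# Seed the vocabulary's string store with a few strings (arbitrary examples)
vocab = Vocab(strings=["hello", "world"])
print("hello" in vocab.strings)  # True
```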
@ -21,8 +21,8 @@ menu:

## Package naming conventions {id="conventions"}

In general, spaCy expects all pipeline packages to follow the naming convention
of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name
into three components:
of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into
three components:

1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with
   tagging, parsing, lemmatization and named entity recognition, or `dep` for
@ -22,17 +22,20 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,

<Infobox title="Important note" variant="warning">

To make them compact and fast, spaCy's small [pipeline packages](/models) (all
packages that end in `sm`) **don't ship with word vectors**, and only include
context-sensitive **tensors**. This means you can still use the `similarity()`
methods to compare documents, spans and tokens – but the result won't be as
good, and individual tokens won't have any vectors assigned. So in order to use
_real_ word vectors, you need to download a larger pipeline package:
packages that end in `sm`) **don't ship with word vectors**. In order to use
`similarity()`, you need to download a larger pipeline package that includes
vectors:

```diff
- python -m spacy download en_core_web_sm
+ python -m spacy download en_core_web_lg
+ python -m spacy download en_core_web_md
```

In spaCy v3 and earlier, small pipeline packages supported `similarity()` by
backing off to context-sensitive tensors from the `tok2vec` component. These
tensors do not work well for this purpose and this backoff has been removed in
spaCy v4.

</Infobox>
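
For example, once a vectors-enabled package is installed, similarity comparisons work as expected (a minimal sketch assuming `en_core_web_md` has already been downloaded):

```python
import spacy

# assumes: python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")
print(doc1.similarity(doc2))  # similarity based on the package's word vectors
```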
Pipeline packages that come with built-in word vectors make them available as
@ -74,23 +74,23 @@ your data.

> ```python
> # Standard import
> from spacy.lang.xx import MultiLanguage
> from spacy.lang.mul import MultiLanguage
> nlp = MultiLanguage()
>
> # With lazy-loading
> nlp = spacy.blank("xx")
> nlp = spacy.blank("mul")
> ```

spaCy also supports pipelines trained on more than one language. This is
especially useful for named entity recognition. The language ID used for
multi-language or language-neutral pipelines is `xx`. The language class, a
multi-language or language-neutral pipelines is `mul`. The language class, a
generic subclass containing only the base language data, can be found in
[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx).
[`lang/mul`](%%GITHUB_SPACY/spacy/lang/mul).

To train a pipeline using the neutral multi-language class, you can set
`lang = "xx"` in your [training config](/usage/training#config). You can also
`lang = "mul"` in your [training config](/usage/training#config). You can also
import the `MultiLanguage` class directly, or call
[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading.
[`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading.

### Chinese language support {id="chinese",version="2.3"}
@ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which

come directly from
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):

| Symbol | Description |
| --------- | ----------- |
| `A < B` | `A` is the immediate dependent of `B`. |
| `A > B` | `A` is the immediate head of `B`. |
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |

| Symbol | Description |
| --------------------------------------- | ----------- |
| `A < B` | `A` is the immediate dependent of `B`. |
| `A > B` | `A` is the immediate head of `B`. |
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
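
To give a sense of how these operators are used, here is a small sketch of a two-token pattern built around the `>++` operator (it assumes an installed pipeline with a parser such as `en_core_web_sm`; the dependency labels are only illustrative):

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")  # assumes this pipeline is installed
matcher = DependencyMatcher(nlp.vocab)

# Anchor on a verb and require a child that sits to its right in the sentence
# (the ">++" operator from the table above).
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    {
        "LEFT_ID": "verb",
        "REL_OP": ">++",
        "RIGHT_ID": "right_child",
        "RIGHT_ATTRS": {"DEP": {"IN": ["dobj", "attr", "prep"]}},
    },
]
matcher.add("VERB_WITH_RIGHT_CHILD", [pattern])

doc = nlp("She founded the company in 2010.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])
```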
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
@ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline")

The saved pipeline now includes the `"entity_ruler"` in its
[`config.cfg`](/api/data-formats#config) and the pipeline directory contains a
file `entityruler.jsonl` with the patterns. When you load the pipeline back in,
all pipeline components will be restored and deserialized – including the entity
file `patterns.jsonl` with the patterns. When you load the pipeline back in, all
pipeline components will be restored and deserialized – including the entity
ruler. This lets you ship powerful pipeline packages with binary weights _and_
rules included!
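
As a quick sketch of this round trip (the path and the single pattern are placeholders):

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.to_disk("/path/to/pipeline")  # placeholder path

# Loading the pipeline back restores the entity ruler together with its patterns
nlp2 = spacy.load("/path/to/pipeline")
print(nlp2.pipe_names)                 # ['entity_ruler']
print(nlp2("Apple is hiring.").ents)   # (Apple,)
```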
@ -155,6 +155,21 @@ An error is now raised when unsupported values are given as input to train a

`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
as explained in the [docs](/api/textcategorizer#assigned-attributes).

### Using the default knowledge base

As `KnowledgeBase` is now an abstract class, you should call the constructor of
the new `InMemoryLookupKB` instead when you want to use spaCy's default KB
implementation:

```diff
- kb = KnowledgeBase()
+ kb = InMemoryLookupKB()
```

If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to
implement its abstract methods, or alternatively inherit from `InMemoryLookupKB`
instead.
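
For instance, a minimal sketch of setting up the default KB (the entity ID, frequency and vector values are made up purely for illustration):

```python
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)

# Made-up entity and alias, just to show the calls
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[1.0])
print(kb.get_size_entities(), kb.get_size_aliases())  # 1 1
```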
### Updated scorers for tokenization and textcat {id="scores"}

We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
@ -58,12 +58,12 @@ arcs.

</Infobox>

| Argument | Description |
| --------- | ----------- |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |

| Argument | Description |
| --------- | ----------- |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |

For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options).
@ -165,7 +165,7 @@

"has_examples": true
},
{
"code": "is",
"code": "isl",
"name": "Icelandic"
},
{
@ -434,9 +434,9 @@

]
},
{
"code": "xx",
"code": "mul",
"name": "Multi-language",
"models": ["xx_ent_wiki_sm", "xx_sent_ud_sm"],
"models": ["mul_ent_wiki_sm", "mul_sent_ud_sm"],
"example": "This is a sentence about Facebook."
},
{
@ -103,7 +103,7 @@ const QuickstartInstall = ({ id, title, description, children }) => {

</QS>
<QS config="example" prompt="python">
print([
{code === 'xx'
{code === 'mul'
? '(ent.text, ent.label) for ent in doc.ents'
: '(w.text, w.pos_) for w in doc'}
])