diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 7c3c3e0a6..d1154756c 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -69,6 +69,11 @@ steps: # displayName: 'Test skip re-download (#12188)' # condition: eq(variables['python_version'], '3.8') +# - script: | +# python -W error -m spacy info ca_core_news_sm | grep -q download_url +# displayName: 'Test download_url in info CLI' +# condition: eq(variables['python_version'] '3.8') + - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . displayName: 'Test convert CLI' diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 70882c3cc..555322782 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -16,7 +16,7 @@ jobs: with: ref: ${{ github.head_ref }} - uses: actions/setup-python@v4 - - run: pip install black + - run: pip install black -c requirements.txt - name: Auto-format code if needed run: black spacy # We can't run black --check here because that returns a non-zero excit diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1a7c0c9a4..3c0b27c1d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its Python modules. If you've built spaCy from source, you'll already have both tools installed. +As a general rule of thumb, we use f-strings for any formatting of strings. +One exception are calls to Python's `logging` functionality. +To avoid unnecessary string conversions in these cases, we use string formatting +templates with `%s` and `%d` etc. + **⚠️ Note that formatting and linting is currently only possible for Python modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a6a575315..9b7ebbe01 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -41,7 +41,7 @@ jobs: inputs: versionSpec: "3.8" - script: | - pip install black==22.3.0 + pip install black -c requirements.txt python -m black spacy --check displayName: "black" - script: | diff --git a/requirements.txt b/requirements.txt index 78cccfbf1..6f4b61918 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<6.0.0 hypothesis>=3.27.0,<7.0.0 -mypy>=0.990,<0.1000; platform_machine != "aarch64" +mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7" +types-dataclasses>=0.1.3; python_version < "3.7" types-mock>=0.1.1 types-setuptools>=57.0.0 types-requests types-setuptools>=57.0.0 -black>=22.0,<23.0 +black==22.3.0 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 536b263a0..e91670879 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -90,9 +90,9 @@ def parse_config_overrides( cli_overrides = _parse_overrides(args, is_cli=True) if cli_overrides: keys = [k for k in cli_overrides if k not in env_overrides] - logger.debug(f"Config overrides from CLI: {keys}") + logger.debug("Config overrides from CLI: %s", keys) if env_overrides: - logger.debug(f"Config overrides from env variables: {list(env_overrides)}") + logger.debug("Config overrides from env variables: %s", list(env_overrides)) return {**cli_overrides, **env_overrides} diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 974bc0f4e..23b69a81d 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,10 +1,10 @@ from typing import Optional, Dict, Any, Union, List import platform -import 
pkg_resources import json from pathlib import Path from wasabi import Printer, MarkdownRenderer import srsly +import importlib.metadata from ._util import app, Arg, Opt, string_to_list from .download import get_model_filename, get_latest_version @@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]: dist-info available. """ try: - dist = pkg_resources.get_distribution(model) - data = json.loads(dist.get_metadata("direct_url.json")) - return data["url"] - except pkg_resources.DistributionNotFound: - # no such package - return None + dist = importlib.metadata.distribution(model) + text = dist.read_text("direct_url.json") + if isinstance(text, str): + data = json.loads(text) + return data["url"] except Exception: - # something else, like no file or invalid JSON - return None + pass + return None def info_model_url(model: str) -> Dict[str, Any]: diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index f279cf793..5d5c14957 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -21,7 +21,6 @@ def init_vectors_cli( prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), - name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), # fmt: on @@ -44,7 +43,6 @@ def init_vectors_cli( vectors_loc, truncate=truncate, prune=prune, - name=name, mode=mode, ) msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 056b1c2e6..cb0177b5d 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -252,7 +252,7 @@ def get_third_party_dependencies( raise regerr from None module_name = func_info.get("module") # type: ignore[attr-defined] if module_name: # the code is part of a module, not a --code file - modules.add(func_info["module"].split(".")[0]) # type: ignore[index] + modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr] dependencies = [] for module_name in modules: if module_name in distributions: diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index 6e3cde88c..8894baa50 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): # in the list. while commands: for i, cmd in enumerate(list(commands)): - logger.debug(f"CMD: {cmd['name']}.") + logger.debug("CMD: %s.", cmd["name"]) deps = [project_dir / dep for dep in cmd.get("deps", [])] if all(dep.exists() for dep in deps): cmd_hash = get_command_hash("", "", deps, cmd["script"]) for output_path in cmd.get("outputs", []): url = storage.pull(output_path, command_hash=cmd_hash) logger.debug( - f"URL: {url} for {output_path} with command hash {cmd_hash}" + "URL: %s for %s with command hash %s", + url, + output_path, + cmd_hash, ) yield url, output_path @@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): commands.pop(i) break else: - logger.debug(f"Dependency missing. 
Skipping {cmd['name']} outputs.") + logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"]) else: # If we didn't break the for loop, break the while loop. break diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py index bc779e9cd..a8178de21 100644 --- a/spacy/cli/project/push.py +++ b/spacy/cli/project/push.py @@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str): remote = config["remotes"][remote] storage = RemoteStorage(project_dir, remote) for cmd in config.get("commands", []): - logger.debug(f"CMD: cmd['name']") + logger.debug("CMD: %s", cmd["name"]) deps = [project_dir / dep for dep in cmd.get("deps", [])] if any(not dep.exists() for dep in deps): - logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs") + logger.debug("Dependency missing. Skipping %s outputs", cmd["name"]) continue cmd_hash = get_command_hash( "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] ) - logger.debug(f"CMD_HASH: {cmd_hash}") + logger.debug("CMD_HASH: %s", cmd_hash) for output_path in cmd.get("outputs", []): output_loc = project_dir / output_path if output_loc.exists() and _is_not_empty_dir(output_loc): @@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str): content_hash=get_content_hash(output_loc), ) logger.debug( - f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}" + "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash ) yield output_path, url diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 6dd174902..0f4858a99 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple import os.path from pathlib import Path -import pkg_resources from wasabi import msg from wasabi.util import locale_escape import sys @@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts exist. """ + import pkg_resources failed_pkgs_msgs: List[str] = [] conflicting_pkgs_msgs: List[str] = [] diff --git a/spacy/errors.py b/spacy/errors.py index eadbf63d6..390de126e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -437,8 +437,7 @@ class Errors(metaclass=ErrorsWithCodes): E133 = ("The sum of prior probabilities for alias '{alias}' should not " "exceed 1, but found {sum}.") E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") - E139 = ("Knowledge base for component '{name}' is empty. Use the methods " - "`kb.add_entity` and `kb.add_alias` to add entries.") + E139 = ("Knowledge base for component '{name}' is empty.") E140 = ("The list of entities, prior probabilities and entity vectors " "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " @@ -951,7 +950,7 @@ class Errors(metaclass=ErrorsWithCodes): E1049 = ("No available port found for displaCy on host {host}. Please specify an available port " "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. 
Please specify an available port with `displacy.serve(doc, port=port)` " - "or use `auto_switch_port=True` to pick an available port automatically.") + "or use `auto_select_port=True` to pick an available port automatically.") # v4 error strings E4000 = ("Expected a Doc as input, but got: '{type}'") @@ -961,6 +960,7 @@ class Errors(metaclass=ErrorsWithCodes): E4003 = ("Training examples for distillation must have the exact same tokens in the " "reference and predicted docs.") E4004 = ("Backprop is not supported when is_train is not set.") + E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.") RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index edba523cf..2a74d047b 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): self._alias_index = PreshMap(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1) + def is_empty(self): + return len(self) == 0 + def __len__(self): return self.get_size_entities() diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 6963e8b79..28e5085a8 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer - - -# Punctuation stolen from Danish -from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES class SwedishDefaults(BaseDefaults): diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py new file mode 100644 index 000000000..67f1bcdc4 --- /dev/null +++ b/spacy/lang/sv/punctuation.py @@ -0,0 +1,33 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import TOKENIZER_SUFFIXES + + +_quotes = CONCAT_QUOTES.replace("'", "") + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER), + ] +) + +_suffixes = [ + suffix + for suffix in TOKENIZER_SUFFIXES + if suffix not in ["'s", "'S", "’s", "’S", r"\'"] +] +_suffixes += [r"(?<=[^sSxXzZ])\'"] + + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/language.py b/spacy/language.py index fb86689bc..3b86fdde7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -106,7 +106,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: @registry.misc("spacy.LookupsDataLoader.v1") def load_lookups_data(lang, tables): - util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") + util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables) lookups = load_lookups(lang=lang, tables=tables) return lookups @@ -174,8 +174,7 @@ class Language: if not isinstance(vocab, Vocab) and vocab is not True: raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) if vocab is True: - vectors_name = meta.get("vectors", {}).get("name") 
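# Illustrative sketch: with this change, vectors no longer carry a name, so the
# vectors_name argument is gone from Vocab() and create_vocab(), and
# meta["vectors"] no longer includes a "name" entry. Example of the calling-code
# update (the variable names here are made up for the example):
#
#     from spacy.vocab import Vocab
#     vocab = Vocab(vectors_name="my_vectors")   # spaCy v3.x
#     vocab = Vocab()                            # this branch: no vectors_name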
- vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name) + vocab = create_vocab(self.lang, self.Defaults) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) @@ -229,7 +228,6 @@ class Language: "width": self.vocab.vectors_length, "vectors": len(self.vocab.vectors), "keys": self.vocab.vectors.n_keys, - "name": self.vocab.vectors.name, "mode": self.vocab.vectors.mode, } self._meta["labels"] = dict(self.pipe_labels) @@ -2074,7 +2072,7 @@ class Language: pipe = self.get_pipe(pipe_name) pipe_cfg = self._pipe_configs[pipe_name] if listeners: - util.logger.debug(f"Replacing listeners of component '{pipe_name}'") + util.logger.debug("Replacing listeners of component '%s'", pipe_name) if len(list(listeners)) != len(pipe_listeners): # The number of listeners defined in the component model doesn't # match the listeners to replace, so we won't be able to update @@ -2197,9 +2195,6 @@ class Language: if path.exists(): data = srsly.read_json(path) self.meta.update(data) - # self.meta always overrides meta["vectors"] with the metadata - # from self.vocab.vectors, so set the name directly - self.vocab.vectors.name = data.get("vectors", {}).get("name") def deserialize_vocab(path: Path) -> None: if path.exists(): @@ -2268,9 +2263,6 @@ class Language: def deserialize_meta(b): data = srsly.json_loads(b) self.meta.update(data) - # self.meta always overrides meta["vectors"] with the metadata - # from self.vocab.vectors, so set the name directly - self.vocab.vectors.name = data.get("vectors", {}).get("name") deserializers: Dict[str, Callable[[bytes], Any]] = {} deserializers["config.cfg"] = lambda b: self.config.from_bytes( diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 4c6004907..e2a1b8a3b 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -82,8 +82,12 @@ cdef class DependencyMatcher: "$-": self._imm_left_sib, "$++": self._right_sib, "$--": self._left_sib, + ">+": self._imm_right_child, + ">-": self._imm_left_child, ">++": self._right_child, ">--": self._left_child, + "<+": self._imm_right_parent, + "<-": self._imm_left_parent, "<++": self._right_parent, "<--": self._left_parent, } @@ -427,12 +431,34 @@ cdef class DependencyMatcher: def _left_sib(self, doc, node): return [doc[child.i] for child in doc[node].head.children if child.i < node] + def _imm_right_child(self, doc, node): + for child in doc[node].children: + if child.i == node + 1: + return [doc[child.i]] + return [] + + def _imm_left_child(self, doc, node): + for child in doc[node].children: + if child.i == node - 1: + return [doc[child.i]] + return [] + def _right_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i > node] def _left_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i < node] + def _imm_right_parent(self, doc, node): + if doc[node].head.i == node + 1: + return [doc[node].head] + return [] + + def _imm_left_parent(self, doc, node): + if doc[node].head.i == node - 1: + return [doc[node].head] + return [] + def _right_parent(self, doc, node): if doc[node].head.i > node: return [doc[node].head] diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 498689a7c..17bdfd394 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -829,6 +829,11 @@ def _get_attr_values(spec, string_store): return attr_values +def _predicate_cache_key(attr, predicate, 
value, *, regex=False, fuzzy=None): + # tuple order affects performance + return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True)) + + # These predicate helper classes are used to match the REGEX, IN, >= etc # extensions to the matcher introduced in #3173. @@ -848,7 +853,7 @@ class _FuzzyPredicate: fuzz = self.predicate[len("FUZZY"):] # number after prefix self.fuzzy = int(fuzz) if fuzz else -1 self.fuzzy_compare = fuzzy_compare - self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy) def __call__(self, Token token): if self.is_extension: @@ -870,7 +875,7 @@ class _RegexPredicate: self.value = re.compile(value) self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -906,7 +911,7 @@ class _SetPredicate: self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -978,7 +983,7 @@ class _ComparisonPredicate: self.value = value self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types, if isinstance(value, dict): for type_, cls in predicate_types.items(): if type_ in value: - key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True)) + key = _predicate_cache_key(attr, type_, value[type_]) if key in seen_predicates: output.append(seen_predicates[key]) else: diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 299b6bb52..7332ca199 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -89,6 +89,14 @@ def load_kb( return kb_from_file +@registry.misc("spacy.EmptyKB.v2") +def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length) + + return empty_kb_factory + + @registry.misc("spacy.EmptyKB.v1") def empty_kb( entity_vector_length: int, diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx index 79be13b00..9b2114900 100644 --- a/spacy/ml/tb_framework.pyx +++ b/spacy/ml/tb_framework.pyx @@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states, cdef np.ndarray step_actions scores = [] - while sizes.states >= 1: + while sizes.states >= 1 and (actions is None or len(actions) > 0): step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f") step_actions = actions[0] if actions is not None else None + assert step_actions is None or step_actions.size == sizes.states, 
\ + f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})" with nogil: _predict_states(cblas, &activations, step_scores.data, states, &weights, sizes) if actions is None: diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 63d5cccc2..6a187b6c3 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -58,6 +58,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, "overwrite": False, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, "candidates_batch_size": 1, @@ -84,6 +85,7 @@ def make_entity_linker( get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, scorer: Optional[Callable], use_gold_ents: bool, @@ -106,6 +108,7 @@ def make_entity_linker( get_candidates_batch ( Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. @@ -114,28 +117,9 @@ def make_entity_linker( prediction is discarded. If None, predictions are not filtered by any threshold. save_activations (bool): save model activations in Doc when annotating. """ - if not model.attrs.get("include_span_maker", False): - try: - from spacy_legacy.components.entity_linker import EntityLinker_v1 - except: - raise ImportError( - "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12." - ) - # The only difference in arguments here is that use_gold_ents and threshold aren't available. - return EntityLinker_v1( - nlp.vocab, - model, - name, - labels_discard=labels_discard, - n_sents=n_sents, - incl_prior=incl_prior, - incl_context=incl_context, - entity_vector_length=entity_vector_length, - get_candidates=get_candidates, - overwrite=overwrite, - scorer=scorer, - ) + raise ValueError(Errors.E4005) + return EntityLinker( nlp.vocab, model, @@ -147,6 +131,7 @@ def make_entity_linker( entity_vector_length=entity_vector_length, get_candidates=get_candidates, get_candidates_batch=get_candidates_batch, + generate_empty_kb=generate_empty_kb, overwrite=overwrite, scorer=scorer, use_gold_ents=use_gold_ents, @@ -188,6 +173,7 @@ class EntityLinker(TrainablePipe): get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool = False, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, @@ -212,6 +198,7 @@ class EntityLinker(TrainablePipe): Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. overwrite (bool): Whether to overwrite existing non-empty annotations. 
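# Illustrative sketch of the generate_empty_kb hook documented above. The
# default factory is the built-in spacy.EmptyKB.v2; the registry name
# "custom.EmptyKB.v1" below is made up for this example.
from spacy.kb import InMemoryLookupKB
from spacy.util import registry
from spacy.vocab import Vocab

@registry.misc("custom.EmptyKB.v1")
def create_custom_empty_kb():
    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
        # Any KnowledgeBase subclass works; here we just return the stock one.
        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)

    return empty_kb_factory

# Config excerpt wiring the factory into the component:
# [components.entity_linker.generate_empty_kb]
# @misc = "custom.EmptyKB.v1"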
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another @@ -219,6 +206,7 @@ class EntityLinker(TrainablePipe): candidates_batch_size (int): Size of batches for entity candidate generation. threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. + save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/entitylinker#init """ @@ -235,6 +223,7 @@ class EntityLinker(TrainablePipe): self.model = model self.name = name self.labels_discard = list(labels_discard) + # how many neighbour sentences to take into account self.n_sents = n_sents self.incl_prior = incl_prior self.incl_context = incl_context @@ -242,9 +231,7 @@ class EntityLinker(TrainablePipe): self.get_candidates_batch = get_candidates_batch self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) - # how many neighbour sentences to take into account - # create an empty KB by default - self.kb = empty_kb(entity_vector_length)(self.vocab) + self.kb = generate_empty_kb(self.vocab, entity_vector_length) self.scorer = scorer self.use_gold_ents = use_gold_ents self.candidates_batch_size = candidates_batch_size @@ -266,7 +253,7 @@ class EntityLinker(TrainablePipe): # Raise an error if the knowledge base is not initialized. if self.kb is None: raise ValueError(Errors.E1018.format(name=self.name)) - if len(self.kb) == 0: + if hasattr(self.kb, "is_empty") and self.kb.is_empty(): raise ValueError(Errors.E139.format(name=self.name)) def initialize( diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index c742aaeaa..d9639f8d5 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,5 +1,6 @@ -from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any +from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple from thinc.api import Model, set_dropout_rate, Optimizer, Config +from thinc.types import Floats2d from itertools import islice from .trainable_pipe import TrainablePipe @@ -157,39 +158,9 @@ class Tok2Vec(TrainablePipe): DOCS: https://spacy.io/api/tok2vec#update """ - if losses is None: - losses = {} validate_examples(examples, "Tok2Vec.update") docs = [eg.predicted for eg in examples] - set_dropout_rate(self.model, drop) - tokvecs, bp_tokvecs = self.model.begin_update(docs) - d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] - losses.setdefault(self.name, 0.0) - - def accumulate_gradient(one_d_tokvecs): - """Accumulate tok2vec loss and gradient. This is passed as a callback - to all but the last listener. Only the last one does the backprop. - """ - nonlocal d_tokvecs - for i in range(len(one_d_tokvecs)): - d_tokvecs[i] += one_d_tokvecs[i] - losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) - return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] - - def backprop(one_d_tokvecs): - """Callback to actually do the backprop. 
Passed to last listener.""" - accumulate_gradient(one_d_tokvecs) - d_docs = bp_tokvecs(d_tokvecs) - if sgd is not None: - self.finish_update(sgd) - return d_docs - - batch_id = Tok2VecListener.get_batch_id(docs) - for listener in self.listeners[:-1]: - listener.receive(batch_id, tokvecs, accumulate_gradient) - if self.listeners: - self.listeners[-1].receive(batch_id, tokvecs, backprop) - return losses + return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses) def get_loss(self, examples, scores) -> None: pass @@ -219,6 +190,96 @@ class Tok2Vec(TrainablePipe): def add_label(self, label): raise NotImplementedError + def distill( + self, + teacher_pipe: Optional["TrainablePipe"], + examples: Iterable["Example"], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ) -> Dict[str, float]: + """Performs an update of the student pipe's model using the + student's distillation examples and sets the annotations + of the teacher's distillation examples using the teacher pipe. + + teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use + for prediction. + examples (Iterable[Example]): Distillation examples. The reference (teacher) + and predicted (student) docs must have the same number of tokens and the + same orthography. + drop (float): dropout rate. + sgd (Optional[Optimizer]): An optimizer. Will be created via + create_optimizer if not set. + losses (Optional[Dict[str, float]]): Optional record of loss during + distillation. + RETURNS: The updated losses dictionary. + + DOCS: https://spacy.io/api/tok2vec#distill + """ + # By default we require a teacher pipe, but there are downstream + # implementations that don't require a pipe. + if teacher_pipe is None: + raise ValueError(Errors.E4002.format(name=self.name)) + teacher_docs = [eg.reference for eg in examples] + student_docs = [eg.predicted for eg in examples] + teacher_preds = teacher_pipe.predict(teacher_docs) + teacher_pipe.set_annotations(teacher_docs, teacher_preds) + return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses) + + def _update_with_docs( + self, + docs: Iterable[Doc], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ): + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + set_dropout_rate(self.model, drop) + + tokvecs, accumulate_gradient, backprop = self._create_backprops( + docs, losses, sgd=sgd + ) + batch_id = Tok2VecListener.get_batch_id(docs) + for listener in self.listeners[:-1]: + listener.receive(batch_id, tokvecs, accumulate_gradient) + if self.listeners: + self.listeners[-1].receive(batch_id, tokvecs, backprop) + return losses + + def _create_backprops( + self, + docs: Iterable[Doc], + losses: Dict[str, float], + *, + sgd: Optional[Optimizer] = None, + ) -> Tuple[Floats2d, Callable, Callable]: + tokvecs, bp_tokvecs = self.model.begin_update(docs) + d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] + + def accumulate_gradient(one_d_tokvecs): + """Accumulate tok2vec loss and gradient. This is passed as a callback + to all but the last listener. Only the last one does the backprop. + """ + nonlocal d_tokvecs + for i in range(len(one_d_tokvecs)): + d_tokvecs[i] += one_d_tokvecs[i] + losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) + return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] + + def backprop(one_d_tokvecs): + """Callback to actually do the backprop. 
Passed to last listener.""" + accumulate_gradient(one_d_tokvecs) + d_docs = bp_tokvecs(d_tokvecs) + if sgd is not None: + self.finish_update(sgd) + return d_docs + + return tokvecs, accumulate_gradient, backprop + class Tok2VecListener(Model): """A layer that gets fed its answers from an upstream connection, diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 9e50dd7b2..2d2a36252 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -36,6 +36,11 @@ from ..errors import Errors, Warnings from .. import util +# TODO: Remove when we switch to Cython 3. +cdef extern from "" namespace "std" nogil: + bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except + + + NUMPY_OPS = NumpyOps() @@ -253,8 +258,8 @@ class Parser(TrainablePipe): # batch uniform length. Since we do not have a gold standard # sequence, we use the teacher's predictions as the gold # standard. - max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) - states = self._init_batch(teacher_pipe, student_docs, max_moves) + max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2)) + states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves) else: states = self.moves.init_batch(student_docs) @@ -265,12 +270,12 @@ class Parser(TrainablePipe): # gradients of the student's transition distributions relative to the # teacher's distributions. - student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves, - max_moves=max_moves) + student_inputs = TransitionModelInputs(docs=student_docs, + states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) - actions = states2actions(student_states) + actions = _states_diff_to_actions(states, student_states) teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples], - moves=self.moves, actions=actions) + states=states, moves=teacher_pipe.moves, actions=actions) (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs) loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores) @@ -522,7 +527,7 @@ class Parser(TrainablePipe): set_dropout_rate(self.model, 0.0) student_inputs = TransitionModelInputs(docs=docs, moves=self.moves) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) - actions = states2actions(student_states) + actions = _states_to_actions(student_states) teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions) _, teacher_scores = self._rehearsal_model.predict(teacher_inputs) @@ -642,7 +647,7 @@ class Parser(TrainablePipe): raise ValueError(Errors.E149) from None return self - def _init_batch(self, teacher_step_model, docs, max_length): + def _init_batch_from_teacher(self, teacher_pipe, docs, max_length): """Make a square batch of length equal to the shortest transition sequence or a cap. A long doc will get multiple states. Let's say we have a doc of length 2*N, @@ -651,10 +656,12 @@ class Parser(TrainablePipe): _init_gold_batch, this version uses a teacher model to generate the cut sequences.""" cdef: - StateClass start_state StateClass state - Transition action - all_states = self.moves.init_batch(docs) + TransitionSystem moves = teacher_pipe.moves + + # Start with the same heuristic as in supervised training: exclude + # docs that are within the maximum length. 
+ all_states = moves.init_batch(docs) states = [] to_cut = [] for state, doc in zip(all_states, docs): @@ -663,18 +670,28 @@ class Parser(TrainablePipe): states.append(state) else: to_cut.append(state) + + if not to_cut: + return states + + # Parse the states that are too long with the teacher's parsing model. + teacher_inputs = TransitionModelInputs(docs=docs, moves=moves, + states=[state.copy() for state in to_cut]) + (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs) + + # Step through the teacher's actions and store every state after + # each multiple of max_length. + teacher_actions = _states_to_actions(teacher_states) while to_cut: states.extend(state.copy() for state in to_cut) - # Move states forward max_length actions. - length = 0 - while to_cut and length < max_length: - teacher_scores = teacher_step_model.predict(to_cut) - self.transition_states(to_cut, teacher_scores) - # States that are completed do not need further cutting. - to_cut = [state for state in to_cut if not state.is_final()] - length += 1 - return states + for step_actions in teacher_actions[:max_length]: + to_cut = moves.apply_actions(to_cut, step_actions) + teacher_actions = teacher_actions[max_length:] + if len(teacher_actions) < max_length: + break + + return states def _init_gold_batch(self, examples, max_length): """Make a square batch, of length equal to the shortest transition @@ -736,7 +753,7 @@ def _change_attrs(model, **kwargs): model.attrs[key] = value -def states2actions(states: List[StateClass]) -> List[Ints1d]: +def _states_to_actions(states: List[StateClass]) -> List[Ints1d]: cdef int step cdef StateClass state cdef StateC* c_state @@ -757,3 +774,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]: actions.append(numpy.array(step_actions, dtype="i")) return actions + +def _states_diff_to_actions( + before_states: List[StateClass], + after_states: List[StateClass] +) -> List[Ints1d]: + """ + Return for two sets of states the actions to go from the first set of + states to the second set of states. The histories of the first set of + states must be a prefix of the second set of states. + """ + cdef StateClass before_state, after_state + cdef StateC* c_state_before + cdef StateC* c_state_after + + assert len(before_states) == len(after_states) + + # Check invariant: before states histories must be prefixes of after states. + for before_state, after_state in zip(before_states, after_states): + c_state_before = before_state.c + c_state_after = after_state.c + + assert equal(c_state_before.history.begin(), c_state_before.history.end(), + c_state_after.history.begin()) + + actions = [] + while True: + step = len(actions) + + step_actions = [] + for before_state, after_state in zip(before_states, after_states): + c_state_before = before_state.c + c_state_after = after_state.c + if step < c_state_after.history.size() - c_state_before.history.size(): + step_actions.append(c_state_after.history[c_state_before.history.size() + step]) + + # We are done if we have exhausted all histories. 
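# (A step with an empty action list means every after-state history has been
#  consumed, i.e. the full diff between the two state sets has been emitted.)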
+ if len(step_actions) == 0: + break + + actions.append(numpy.array(step_actions, dtype="i")) + + return actions diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py index bbb0ff415..0aa495992 100644 --- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py @@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text): def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 3 + + +@pytest.mark.issue(12311) +@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"]) +def test_sv_tokenizer_handles_colon(sv_tokenizer, text): + tokens = sv_tokenizer(text) + assert len(tokens) == 1 diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index b4e19d69d..200384320 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches): ("the", "brown", "$--", 0), ("brown", "the", "$--", 1), ("brown", "brown", "$--", 0), + ("over", "jumped", "<+", 0), + ("quick", "fox", "<+", 0), + ("the", "quick", "<+", 0), + ("brown", "fox", "<+", 1), ("quick", "fox", "<++", 1), ("quick", "over", "<++", 0), ("over", "jumped", "<++", 0), ("the", "fox", "<++", 2), + ("brown", "fox", "<-", 0), + ("fox", "over", "<-", 0), + ("the", "over", "<-", 0), + ("over", "jumped", "<-", 1), ("brown", "fox", "<--", 0), ("fox", "jumped", "<--", 0), ("fox", "over", "<--", 1), + ("fox", "brown", ">+", 0), + ("over", "fox", ">+", 0), + ("over", "the", ">+", 0), + ("jumped", "over", ">+", 1), ("jumped", "over", ">++", 1), ("fox", "lazy", ">++", 0), ("over", "the", ">++", 0), + ("jumped", "over", ">-", 0), + ("fox", "quick", ">-", 0), + ("brown", "quick", ">-", 0), + ("fox", "brown", ">-", 1), ("brown", "fox", ">--", 0), ("fox", "brown", ">--", 1), ("jumped", "fox", ">--", 1), diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py new file mode 100644 index 000000000..8c1cf7a93 --- /dev/null +++ b/spacy/tests/parser/test_model.py @@ -0,0 +1,61 @@ +import numpy +import pytest + +from spacy.lang.en import English +from spacy.ml.tb_framework import TransitionModelInputs +from spacy.training import Example + +TRAIN_DATA = [ + ( + "They trade mortgage-backed securities.", + { + "heads": [1, 1, 4, 4, 5, 1, 1], + "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"], + }, + ), + ( + "I like London and Berlin.", + { + "heads": [1, 1, 1, 2, 2, 1], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + }, + ), +] + + +@pytest.fixture +def nlp_parser(): + nlp = English() + parser = nlp.add_pipe("parser") + + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for dep in annotations["deps"]: + parser.add_label(dep) + nlp.initialize() + + return nlp, parser + + +def test_incorrect_number_of_actions(nlp_parser): + nlp, parser = nlp_parser + doc = nlp.make_doc("test") + + # Too many actions for the number of docs + with pytest.raises(AssertionError): + parser.model.predict( + TransitionModelInputs( + docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")] + ) + ) + + # Too few actions for the number of docs + with pytest.raises(AssertionError): + parser.model.predict( + TransitionModelInputs( + docs=[doc, doc], + 
moves=parser.moves, + actions=[numpy.array([0], dtype="i")], + ) + ) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index d6cd11e55..62b8f9704 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -623,7 +623,9 @@ def test_is_distillable(): assert ner.is_distillable -def test_distill(): +@pytest.mark.slow +@pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) +def test_distill(max_moves): teacher = English() teacher_ner = teacher.add_pipe("ner") train_examples = [] @@ -641,6 +643,7 @@ def test_distill(): student = English() student_ner = student.add_pipe("ner") + student_ner.cfg["update_with_oracle_cut_size"] = max_moves student_ner.initialize( get_examples=lambda: train_examples, labels=teacher_ner.label_data ) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 57b6e188b..2f2fa397e 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -463,7 +463,9 @@ def test_is_distillable(): assert parser.is_distillable -def test_distill(): +@pytest.mark.slow +@pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) +def test_distill(max_moves): teacher = English() teacher_parser = teacher.add_pipe("parser") train_examples = [] @@ -481,6 +483,7 @@ def test_distill(): student = English() student_parser = student.add_pipe("parser") + student_parser.cfg["update_with_oracle_cut_size"] = max_moves student_parser.initialize( get_examples=lambda: train_examples, labels=teacher_parser.label_data ) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 506530591..87cacfc9d 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -353,6 +353,9 @@ def test_kb_default(nlp): """Test that the default (empty) KB is loaded upon construction""" entity_linker = nlp.add_pipe("entity_linker", config={}) assert len(entity_linker.kb) == 0 + with pytest.raises(ValueError, match="E139"): + # this raises an error because the KB is empty + entity_linker.validate_kb() assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_aliases() == 0 # 64 is the default value from pipeline.entity_linker @@ -990,13 +993,11 @@ def test_scorer_links(): @pytest.mark.parametrize( "name,config", [ - ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ], ) # fmt: on def test_legacy_architectures(name, config): - from spacy_legacy.components.entity_linker import EntityLinker_v1 # Ensure that the legacy architectures still work vector_length = 3 @@ -1019,10 +1020,7 @@ def test_legacy_architectures(name, config): return mykb entity_linker = nlp.add_pipe(name, config={"model": config}) - if config["@architectures"] == "spacy.EntityLinker.v1": - assert isinstance(entity_linker, EntityLinker_v1) - else: - assert isinstance(entity_linker, EntityLinker) + assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) optimizer = nlp.initialize(get_examples=lambda: train_examples) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index ee62b1ab4..6929b76fa 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -540,3 +540,86 @@ def test_tok2vec_listeners_textcat(): assert cats1["imperative"] < 0.9 assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] assert [t.tag_ for t 
in docs[1]] == ["N", "V", "J", "N"] + + +cfg_string_distillation = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","tagger"] + + [components] + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v2" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v2" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +def test_tok2vec_distillation_teacher_annotations(): + orig_config = Config().from_str(cfg_string_distillation) + teacher_nlp = util.load_model_from_config( + orig_config, auto_fill=True, validate=True + ) + student_nlp = util.load_model_from_config( + orig_config, auto_fill=True, validate=True + ) + + train_examples_teacher = [] + train_examples_student = [] + for t in TRAIN_DATA: + train_examples_teacher.append( + Example.from_dict(teacher_nlp.make_doc(t[0]), t[1]) + ) + train_examples_student.append( + Example.from_dict(student_nlp.make_doc(t[0]), t[1]) + ) + + optimizer = teacher_nlp.initialize(lambda: train_examples_teacher) + student_nlp.initialize(lambda: train_examples_student) + + # Since Language.distill creates a copy of the examples to use as + # its internal teacher/student docs, we'll need to monkey-patch the + # tok2vec pipe's distill method. 
+ student_tok2vec = student_nlp.get_pipe("tok2vec") + student_tok2vec._old_distill = student_tok2vec.distill + + def tok2vec_distill_wrapper( + self, + teacher_pipe, + examples, + **kwargs, + ): + assert all(not eg.reference.tensor.any() for eg in examples) + out = self._old_distill(teacher_pipe, examples, **kwargs) + assert all(eg.reference.tensor.any() for eg in examples) + return out + + student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec) + student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={}) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 8d3653ab1..f9d2e226b 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,7 +1,10 @@ -from typing import Callable +from pathlib import Path +from typing import Callable, Iterable, Any, Dict -from spacy import util -from spacy.util import ensure_path, registry, load_model_from_config +import srsly + +from spacy import util, Errors +from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList from spacy.kb.kb_in_memory import InMemoryLookupKB from spacy.vocab import Vocab from thinc.api import Config @@ -91,7 +94,10 @@ def test_serialize_subclassed_kb(): [components.entity_linker] factory = "entity_linker" - + + [components.entity_linker.generate_empty_kb] + @misc = "kb_test.CustomEmptyKB.v1" + [initialize] [initialize.components] @@ -99,7 +105,7 @@ def test_serialize_subclassed_kb(): [initialize.components.entity_linker] [initialize.components.entity_linker.kb_loader] - @misc = "spacy.CustomKB.v1" + @misc = "kb_test.CustomKB.v1" entity_vector_length = 342 custom_field = 666 """ @@ -109,10 +115,57 @@ def test_serialize_subclassed_kb(): super().__init__(vocab, entity_vector_length) self.custom_field = custom_field - @registry.misc("spacy.CustomKB.v1") + def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well.""" + path = ensure_path(path) + if not path.exists(): + path.mkdir(parents=True) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def serialize_custom_fields(file_path: Path) -> None: + srsly.write_json(file_path, {"custom_field": self.custom_field}) + + serialize = { + "contents": lambda p: self.write_contents(p), + "strings.json": lambda p: self.vocab.strings.to_disk(p), + "custom_fields": lambda p: serialize_custom_fields(p), + } + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well.""" + path = ensure_path(path) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def deserialize_custom_fields(file_path: Path) -> None: + self.custom_field = srsly.read_json(file_path)["custom_field"] + + deserialize: Dict[str, Callable[[Any], Any]] = { + "contents": lambda p: self.read_contents(p), + "strings.json": lambda p: self.vocab.strings.from_disk(p), + "custom_fields": lambda p: deserialize_custom_fields(p), + } + util.from_disk(path, deserialize, exclude) + + @registry.misc("kb_test.CustomEmptyKB.v1") + def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return SubInMemoryLookupKB( 
+ vocab=vocab, + entity_vector_length=entity_vector_length, + custom_field=0, + ) + + return empty_kb_factory + + @registry.misc("kb_test.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int - ) -> Callable[[Vocab], InMemoryLookupKB]: + ) -> Callable[[Vocab], SubInMemoryLookupKB]: def custom_kb_factory(vocab): kb = SubInMemoryLookupKB( vocab=vocab, @@ -139,6 +192,6 @@ def test_serialize_subclassed_kb(): nlp2 = util.load_model_from_path(tmp_dir) entity_linker2 = nlp2.get_pipe("entity_linker") # After IO, the KB is the standard one - assert type(entity_linker2.kb) == InMemoryLookupKB + assert type(entity_linker2.kb) == SubInMemoryLookupKB assert entity_linker2.kb.entity_vector_length == 342 - assert not hasattr(entity_linker2.kb, "custom_field") + assert entity_linker2.kb.custom_field == 666 diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 36129a408..4720bc4da 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -181,7 +181,7 @@ def test_issue4042_bug2(): @pytest.mark.issue(4725) def test_issue4725_1(): """Ensure the pickling of the NER goes well""" - vocab = Vocab(vectors_name="test_vocab_add_vector") + vocab = Vocab() nlp = English(vocab=vocab) config = { "update_with_oracle_cut_size": 111, diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index dc7ce46fe..752750d33 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -2,7 +2,6 @@ import os import math from collections import Counter from typing import Tuple, List, Dict, Any -import pkg_resources import time from pathlib import Path @@ -1126,6 +1125,7 @@ def test_cli_find_threshold(capsys): ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.parametrize( "reqs,output", [ @@ -1158,6 +1158,8 @@ def test_cli_find_threshold(capsys): ], ) def test_project_check_requirements(reqs, output): + import pkg_resources + # excessive guard against unlikely package name try: pkg_resources.require("spacyunknowndoesnotexist12345") diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 648a52374..2a780fb50 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,6 +1,7 @@ import os from pathlib import Path import pytest +import srsly import subprocess from typer.testing import CliRunner from spacy.tokens import DocBin, Doc @@ -298,3 +299,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab): # Instead of checking specific wording of the output, which may change, # we'll check that this section of the debug output is present. 
assert "= Trainable Lemmatizer =" in result_debug_data.stdout + + +# project tests + +SAMPLE_PROJECT = { + "title": "Sample project", + "description": "This is a project for testing", + "assets": [ + { + "dest": "assets/spacy-readme.md", + "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md", + "checksum": "411b2c89ccf34288fae8ed126bf652f7", + }, + { + "dest": "assets/citation.cff", + "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff", + "checksum": "c996bfd80202d480eb2e592369714e5e", + "extra": True, + }, + ], + "commands": [ + { + "name": "ok", + "help": "print ok", + "script": ["python -c \"print('okokok')\""], + }, + { + "name": "create", + "help": "make a file", + "script": ["touch abc.txt"], + "outputs": ["abc.txt"], + }, + { + "name": "clean", + "help": "remove test file", + "script": ["rm abc.txt"], + }, + ], +} + +SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT) + + +@pytest.fixture +def project_dir(): + with make_tempdir() as pdir: + (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT) + yield pdir + + +def test_project_document(project_dir): + readme_path = project_dir / "README.md" + assert not readme_path.exists(), "README already exists" + result = CliRunner().invoke( + app, ["project", "document", str(project_dir), "-o", str(readme_path)] + ) + assert result.exit_code == 0 + assert readme_path.is_file() + text = readme_path.read_text("utf-8") + assert SAMPLE_PROJECT["description"] in text + + +def test_project_assets(project_dir): + asset_dir = project_dir / "assets" + assert not asset_dir.exists(), "Assets dir is already present" + result = CliRunner().invoke(app, ["project", "assets", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded" + # check that extras work + result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded" + + +def test_project_run(project_dir): + # make sure dry run works + test_file = project_dir / "abc.txt" + result = CliRunner().invoke( + app, ["project", "run", "--dry", "create", str(project_dir)] + ) + assert result.exit_code == 0 + assert not test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)]) + assert result.exit_code == 0 + assert "okokok" in result.stdout + + +@pytest.mark.parametrize( + "options", + [ + "", + # "--sparse", + "--branch v3", + "--repo https://github.com/explosion/projects --branch v3", + ], +) +def test_project_clone(options): + with make_tempdir() as workspace: + out = workspace / "project" + target = "benchmarks/ner_conll03" + if not options: + options = [] + else: + options = options.split() + result = CliRunner().invoke( + app, ["project", "clone", target, *options, str(out)] + ) + assert result.exit_code == 0 + assert (out / "README.md").is_file() + + +def test_project_push_pull(project_dir): + proj = dict(SAMPLE_PROJECT) + remote = "xyz" + + with make_tempdir() as remote_dir: + proj["remotes"] = {remote: str(remote_dir)} + proj_text = srsly.yaml_dumps(proj) + (project_dir / "project.yml").write_text(proj_text) + + test_file = project_dir / "abc.txt" + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert 
test_file.is_file() + result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)]) + assert result.exit_code == 0 + result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)]) + assert result.exit_code == 0 + assert not test_file.exists() + result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 3d0905dd3..9b8c7b9c7 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -98,7 +98,7 @@ def assert_sents_error(doc): def warn_error(proc_name, proc, docs, e): logger = logging.getLogger("spacy") - logger.warning(f"Trouble with component {proc_name}.") + logger.warning("Trouble with component %s.", proc_name) @pytest.fixture diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 70835816d..ed1322908 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -84,7 +84,7 @@ def test_issue1539(): @pytest.mark.issue(1807) def test_issue1807(): """Test vocab.set_vector also adds the word to the vocab.""" - vocab = Vocab(vectors_name="test_issue1807") + vocab = Vocab() assert "hello" not in vocab vocab.set_vector("hello", numpy.ones((50,), dtype="f")) assert "hello" in vocab @@ -94,13 +94,12 @@ def test_issue1807(): def test_issue2871(): """Test that vectors recover the correct key for spaCy reserved words.""" words = ["dog", "cat", "SUFFIX"] - vocab = Vocab(vectors_name="test_issue2871") + vocab = Vocab() vocab.vectors.resize(shape=(3, 10)) vector_data = numpy.zeros((3, 10), dtype="f") for word in words: _ = vocab[word] # noqa: F841 vocab.set_vector(word, vector_data[0]) - vocab.vectors.name = "dummy_vectors" assert vocab["dog"].rank == 0 assert vocab["cat"].rank == 1 assert vocab["SUFFIX"].rank == 2 @@ -125,7 +124,7 @@ def test_issue4725_2(): # ensures that this runs correctly and doesn't hang or crash because of the global vectors # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. 
on Windows), # or because of issues with pickling the NER (cf test_issue4725_1) - vocab = Vocab(vectors_name="test_vocab_add_vector") + vocab = Vocab() data = numpy.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 @@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2): def test_vocab_add_vector(): - vocab = Vocab(vectors_name="test_vocab_add_vector") + vocab = Vocab() data = OPS.xp.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 @@ -356,7 +355,7 @@ def test_vocab_add_vector(): def test_vocab_prune_vectors(): - vocab = Vocab(vectors_name="test_vocab_prune_vectors") + vocab = Vocab() _ = vocab["cat"] # noqa: F841 _ = vocab["dog"] # noqa: F841 _ = vocab["kitten"] # noqa: F841 @@ -405,7 +404,7 @@ def test_vectors_serialize(): def test_vector_is_oov(): - vocab = Vocab(vectors_name="test_vocab_is_oov") + vocab = Vocab() data = OPS.xp.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 93cd8de05..48bc21c27 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -105,6 +105,7 @@ class Doc: start_idx: int, end_idx: int, label: Union[int, str] = ..., + *, kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., alignment_mode: str = ..., @@ -127,12 +128,12 @@ class Doc: blocked: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ..., - default: str = ... + default: str = ..., ) -> None: ... @property - def noun_chunks(self) -> Iterator[Span]: ... + def noun_chunks(self) -> Tuple[Span]: ... @property - def sents(self) -> Iterator[Span]: ... + def sents(self) -> Tuple[Span]: ... @property def lang(self) -> int: ... @property diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2eca1aafd..0ea2c39ab 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -520,7 +520,7 @@ cdef class Doc: def doc(self): return self - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): + def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be created. @@ -657,9 +657,6 @@ cdef class Doc: elif self.vocab.vectors.size > 0: self._vector = sum(t.vector for t in self) / len(self) return self._vector - elif self.tensor.size > 0: - self._vector = self.tensor.mean(axis=0) - return self._vector else: return xp.zeros((self.vocab.vectors_length,), dtype="float32") @@ -706,10 +703,10 @@ cdef class Doc: return self.text property ents: - """The named entities in the document. Returns a tuple of named entity + """The named entities in the document. Returns a list of named entity `Span` objects, if the entity recognizer has been applied. - RETURNS (tuple): Entities in the document, one `Span` per entity. + RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity. DOCS: https://spacy.io/api/doc#ents """ @@ -867,7 +864,7 @@ cdef class Doc: NP-level coordination, no prepositional phrases, and no relative clauses. - YIELDS (Span): Noun chunks in the document. + RETURNS (Tuple[Span]): Noun chunks in the document. DOCS: https://spacy.io/api/doc#noun_chunks """ @@ -876,36 +873,35 @@ cdef class Doc: # Accumulate the result before beginning to iterate over it. This # prevents the tokenization from being changed out from under us - # during the iteration. 
The tricky thing here is that Span accepts - # its tokenization changing, so it's okay once we have the Span - # objects. See Issue #375. + # during the iteration. spans = [] for start, end, label in self.noun_chunks_iterator(self): spans.append(Span(self, start, end, label=label)) - for span in spans: - yield span + return tuple(spans) @property def sents(self): """Iterate over the sentences in the document. Yields sentence `Span` objects. Sentence spans have no label. - YIELDS (Span): Sentences in the document. + RETURNS (Tuple[Span]): Sentences in the document. DOCS: https://spacy.io/api/doc#sents """ if not self.has_annotation("SENT_START"): raise ValueError(Errors.E030) if "sents" in self.user_hooks: - yield from self.user_hooks["sents"](self) + return tuple(self.user_hooks["sents"](self)) else: start = 0 + spans = [] for i in range(1, self.length): if self.c[i].sent_start == 1: - yield Span(self, start, i) + spans.append(Span(self, start, i)) start = i if start != self.length: - yield Span(self, start, self.length) + spans.append(Span(self, start, self.length)) + return tuple(spans) @property def lang(self): @@ -1605,7 +1601,7 @@ cdef class Doc: for span_group in doc_json.get("spans", {}): spans = [] for span in doc_json["spans"][span_group]: - char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"]) + char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"]) if char_span is None: raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"])) spans.append(char_span) diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 979e74e7e..e5031fea9 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -74,6 +74,8 @@ class Span: @property def ents(self) -> Tuple[Span]: ... @property + def sents(self) -> Tuple[Span]: ... + @property def has_vector(self) -> bool: ... @property def vector(self) -> Floats1d: ... @@ -86,7 +88,7 @@ class Span: @property def text_with_ws(self) -> str: ... @property - def noun_chunks(self) -> Iterator[Span]: ... + def noun_chunks(self) -> Tuple[Span]: ... @property def root(self) -> Token: ... def char_span( @@ -94,9 +96,9 @@ class Span: start_idx: int, end_idx: int, label: Union[int, str] = ..., + *, kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., - id: Union[int, str] = ..., alignment_mode: str = ..., span_id: Union[int, str] = ..., ) -> Span: ... diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index aefea4f71..75f7db7ca 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -134,10 +134,8 @@ cdef class Span: else: return True - cdef SpanC* span_c = self.span_c() - cdef SpanC* other_span_c = other.span_c() - self_tuple = (span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, self.id, self.doc) - other_tuple = (other_span_c.start_char, other_span_c.end_char, other_span_c.label, other_span_c.kb_id, other.id, other.doc) + self_tuple = self._cmp_tuple() + other_tuple = other._cmp_tuple() # < if op == 0: return self_tuple < other_tuple @@ -158,8 +156,20 @@ cdef class Span: return self_tuple >= other_tuple def __hash__(self): + return hash(self._cmp_tuple()) + + def _cmp_tuple(self): cdef SpanC* span_c = self.span_c() - return hash((self.doc, span_c.start_char, span_c.end_char, span_c.label, span_c.kb_id, span_c.id)) + return ( + span_c.start_char, + span_c.end_char, + span_c.start, + span_c.end, + span_c.label, + span_c.kb_id, + span_c.id, + self.doc, + ) def __len__(self): """Get the number of tokens in the span. 
@@ -451,20 +461,21 @@ cdef class Span: """Obtain the sentences that contain this span. If the given span crosses sentence boundaries, return all sentences it is a part of. - RETURNS (Iterable[Span]): All sentences that the span is a part of. + RETURNS (Tuple[Span]): All sentences that the span is a part of. - DOCS: https://spacy.io/api/span#sents + DOCS: https://spacy.io/api/span#sents """ cdef int start cdef int i if "sents" in self.doc.user_span_hooks: - yield from self.doc.user_span_hooks["sents"](self) - elif "sents" in self.doc.user_hooks: + return tuple(self.doc.user_span_hooks["sents"](self)) + spans = [] + if "sents" in self.doc.user_hooks: for sentence in self.doc.user_hooks["sents"](self.doc): if sentence.end > self.start: if sentence.start < self.end or sentence.start == self.start == self.end: - yield sentence + spans.append(sentence) else: break else: @@ -479,12 +490,13 @@ cdef class Span: # Now, find all the sentences in the span for i in range(start + 1, self.doc.length): if self.doc.c[i].sent_start == 1: - yield Span(self.doc, start, i) + spans.append(Span(self.doc, start, i)) start = i if start >= self.end: break if start < self.end: - yield Span(self.doc, start, self.end) + spans.append(Span(self.doc, start, self.end)) + return tuple(spans) @property @@ -492,7 +504,7 @@ cdef class Span: """The named entities that fall completely within the span. Returns a tuple of `Span` objects. - RETURNS (tuple): Entities in the span, one `Span` per entity. + RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity. DOCS: https://spacy.io/api/span#ents """ @@ -507,7 +519,7 @@ cdef class Span: ents.append(ent) else: break - return ents + return tuple(ents) @property def has_vector(self): @@ -522,8 +534,6 @@ cdef class Span: return self.doc.user_span_hooks["has_vector"](self) elif self.vocab.vectors.size > 0: return any(token.has_vector for token in self) - elif self.doc.tensor.size > 0: - return True else: return False @@ -605,13 +615,15 @@ cdef class Span: NP-level coordination, no prepositional phrases, and no relative clauses. - YIELDS (Span): Noun chunks in the span. + RETURNS (Tuple[Span]): Noun chunks in the span. DOCS: https://spacy.io/api/span#noun_chunks """ + spans = [] for span in self.doc.noun_chunks: if span.start >= self.start and span.end <= self.end: - yield span + spans.append(span) + return tuple(spans) @property def root(self): @@ -656,17 +668,16 @@ cdef class Span: else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0): + def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `span.text[start : end]`. - start (int): The index of the first character of the span. - end (int): The index of the first character after the span. + start_idx (int): The index of the first character of the span. + end_idx (int): The index of the first character after the span. label (Union[int, str]): A label to attach to the Span, e.g. for named entities. kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. - id (Union[int, str]): Unused. alignment_mode (str): How character indices are aligned to token boundaries. 
Options: "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 64c707acd..74f812af7 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -389,8 +389,6 @@ cdef class Token: """ if "has_vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["has_vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return True return self.vocab.has_vector(self.c.lex.orth) @property @@ -404,8 +402,6 @@ cdef class Token: """ if "vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return self.doc.tensor[self.i] else: return self.vocab.get_vector(self.c.lex.orth) diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py index 426fddf90..7e2494f5b 100644 --- a/spacy/training/callbacks.py +++ b/spacy/training/callbacks.py @@ -11,7 +11,7 @@ def create_copy_from_base_model( ) -> Callable[[Language], Language]: def copy_from_base_model(nlp): if tokenizer: - logger.info(f"Copying tokenizer from: {tokenizer}") + logger.info("Copying tokenizer from: %s", tokenizer) base_nlp = load_model(tokenizer) if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]: nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"])) @@ -23,7 +23,7 @@ def create_copy_from_base_model( ) ) if vocab: - logger.info(f"Copying vocab from: {vocab}") + logger.info("Copying vocab from: %s", vocab) # only reload if the vocab is from a different model if tokenizer != vocab: base_nlp = load_model(vocab) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index d626ad0e0..086ad831c 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -29,7 +29,7 @@ def create_docbin_reader( ) -> Callable[["Language"], Iterable[Example]]: if path is None: raise ValueError(Errors.E913) - util.logger.debug(f"Loading corpus from path: {path}") + util.logger.debug("Loading corpus from path: %s", path) return Corpus( path, gold_preproc=gold_preproc, diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 6304e4a84..c626cb813 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": frozen_components = T["frozen_components"] # Sourced components that require resume_training resume_components = [p for p in sourced if p not in frozen_components] - logger.info(f"Pipeline: {nlp.pipe_names}") + logger.info("Pipeline: %s", nlp.pipe_names) if resume_components: with nlp.select_pipes(enable=resume_components): - logger.info(f"Resuming training for: {resume_components}") + logger.info("Resuming training for: %s", resume_components) nlp.resume_training(sgd=optimizer) # Make sure that listeners are defined before initializing further nlp._link_components() @@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": if T["max_epochs"] == -1: sample_size = 100 logger.debug( - f"Due to streamed train corpus, using only first {sample_size} " - f"examples for initialization. If necessary, provide all labels " - f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" + "Due to streamed train corpus, using only first %s examples for initialization. " + "If necessary, provide all labels in [initialize]. 
" + "More info: https://spacy.io/api/cli#init_labels", + sample_size, ) nlp.initialize( lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer ) else: nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) - logger.info(f"Initialized pipeline components: {nlp.pipe_names}") + logger.info("Initialized pipeline components: %s", nlp.pipe_names) # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: for listener in getattr( @@ -109,7 +110,7 @@ def init_vocab( ) -> None: if lookups: nlp.vocab.lookups = lookups - logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") + logger.info("Added vocab lookups: %s", ", ".join(lookups.tables)) data_path = ensure_path(data) if data_path is not None: lex_attrs = srsly.read_jsonl(data_path) @@ -125,11 +126,11 @@ def init_vocab( else: oov_prob = DEFAULT_OOV_PROB nlp.vocab.cfg.update({"oov_prob": oov_prob}) - logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") + logger.info("Added %d lexical entries to the vocab", len(nlp.vocab)) logger.info("Created vocabulary") if vectors is not None: load_vectors_into_model(nlp, vectors) - logger.info(f"Added vectors: {vectors}") + logger.info("Added vectors: %s", vectors) # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) @@ -191,7 +192,7 @@ def init_tok2vec( if weights_data is not None: layer = get_tok2vec_ref(nlp, P) layer.from_bytes(weights_data) - logger.info(f"Loaded pretrained weights from {init_tok2vec}") + logger.info("Loaded pretrained weights from %s", init_tok2vec) return True return False @@ -202,7 +203,6 @@ def convert_vectors( *, truncate: int, prune: int, - name: Optional[str] = None, mode: str = VectorsMode.default, ) -> None: vectors_loc = ensure_path(vectors_loc) @@ -216,13 +216,13 @@ def convert_vectors( nlp.vocab.deduplicate_vectors() else: if vectors_loc: - logger.info(f"Reading vectors from {vectors_loc}") + logger.info("Reading vectors from %s", vectors_loc) vectors_data, vector_keys, floret_settings = read_vectors( vectors_loc, truncate, mode=mode, ) - logger.info(f"Loaded vectors from {vectors_loc}") + logger.info("Loaded vectors from %s", vectors_loc) else: vectors_data, vector_keys = (None, None) if vector_keys is not None and mode != VectorsMode.floret: @@ -241,12 +241,6 @@ def convert_vectors( strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys ) nlp.vocab.deduplicate_vectors() - if name is None: - # TODO: Is this correct? Does this matter? 
- nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" - else: - nlp.vocab.vectors.name = name - nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name if prune >= 1 and mode != VectorsMode.floret: nlp.vocab.prune_vectors(prune) diff --git a/spacy/training/loop.py b/spacy/training/loop.py index fcc023a0d..c737d7c01 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None: if subdir.exists(): try: shutil.rmtree(str(subdir)) - logger.debug(f"Removed existing output directory: {subdir}") + logger.debug("Removed existing output directory: %s", subdir) except Exception as e: raise IOError(Errors.E901.format(path=path)) from e diff --git a/spacy/util.py b/spacy/util.py index e2ca0e6a4..1ce869152 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -33,6 +33,7 @@ import inspect import pkgutil import logging import socket +import stat try: import cupy.random @@ -55,7 +56,7 @@ if TYPE_CHECKING: # fmt: off OOV_RANK = numpy.iinfo(numpy.uint64).max DEFAULT_OOV_PROB = -20 -LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] +LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config file. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. @@ -139,8 +140,17 @@ class registry(thinc.registry): return func @classmethod - def find(cls, registry_name: str, func_name: str) -> Callable: - """Get info about a registered function from the registry.""" + def find( + cls, registry_name: str, func_name: str + ) -> Dict[str, Optional[Union[str, int]]]: + """Find information about a registered function, including the + module and path to the file it's defined in, the line number and the + docstring, if available. + + registry_name (str): Name of the catalogue registry. + func_name (str): Name of the registered function. + RETURNS (Dict[str, Optional[Union[str, int]]]): The function info. + """ # We're overwriting this classmethod so we're able to provide more # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): @@ -1030,8 +1040,15 @@ def make_tempdir() -> Generator[Path, None, None]: """ d = Path(tempfile.mkdtemp()) yield d + + # On Windows, git clones use read-only files, which cause permission errors + # when being deleted. This forcibly fixes permissions. + def force_remove(rmfunc, path, ex): + os.chmod(path, stat.S_IWRITE) + rmfunc(path) + try: - shutil.rmtree(str(d)) + shutil.rmtree(str(d), onerror=force_remove) except PermissionError as e: warnings.warn(Warnings.W091.format(dir=d, msg=e)) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index be0f6db09..bec3ac276 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -52,7 +52,6 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors """ cdef public object strings - cdef public object name cdef readonly object mode cdef public object data cdef public object key2row @@ -64,14 +63,13 @@ cdef class Vectors: cdef readonly unicode bow cdef readonly unicode eow - def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): + def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): """Create a new vector store. 
strings (StringStore): The string store. shape (tuple): Size of the table, as (# entries, # columns) data (numpy.ndarray or cupy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. - name (str): A name to identify the vectors table. mode (str): Vectors mode: "default" or "floret" (default: "default"). minn (int): The floret char ngram minn (default: 0). maxn (int): The floret char ngram maxn (default: 0). @@ -85,7 +83,6 @@ cdef class Vectors: self.strings = strings if self.strings is None: self.strings = StringStore() - self.name = name if mode not in Mode.values(): raise ValueError( Errors.E202.format( diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index 41964703b..871044fff 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -11,7 +11,8 @@ from .vectors import Vectors from pathlib import Path def create_vocab( - lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ... + lang: Optional[str], + defaults: Any, ) -> Vocab: ... class Vocab: @@ -28,7 +29,6 @@ class Vocab: strings: Optional[Union[List[str], StringStore]] = ..., lookups: Optional[Lookups] = ..., oov_prob: float = ..., - vectors_name: Optional[str] = ..., writing_system: Dict[str, Any] = ..., get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ..., ) -> None: ... diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a87f50ad4..f3c3595ef 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -23,7 +23,7 @@ from .lang.norm_exceptions import BASE_NORMS from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang -def create_vocab(lang, defaults, vectors_name=None): +def create_vocab(lang, defaults): # If the spacy-lookups-data package is installed, we pre-populate the lookups # with lexeme data, if available lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} @@ -39,7 +39,6 @@ def create_vocab(lang, defaults, vectors_name=None): lex_attr_getters=lex_attrs, writing_system=defaults.writing_system, get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), - vectors_name=vectors_name, ) @@ -51,8 +50,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, - oov_prob=-20., vectors_name=None, writing_system={}, - get_noun_chunks=None, **deprecated_kwargs): + oov_prob=-20., writing_system={}, get_noun_chunks=None, + **deprecated_kwargs): """Create the vocabulary. lex_attr_getters (dict): A dictionary mapping attribute IDs to @@ -61,7 +60,6 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. oov_prob (float): Default OOV probability. - vectors_name (str): Optional name to identify the vectors table. get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]): A function that yields base noun phrases used for Doc.noun_chunks. 
""" @@ -78,7 +76,7 @@ cdef class Vocab: _ = self[string] self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings) - self.vectors = Vectors(strings=self.strings, name=vectors_name) + self.vectors = Vectors(strings=self.strings) self.lookups = lookups self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks @@ -308,7 +306,7 @@ cdef class Vocab: for key, row in self.vectors.key2row.items() } # replace vectors with deduplicated version - self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name) + self.vectors = Vectors(strings=self.strings, data=data) for key, row in key2row.items(): self.vectors.add(key, row=row) @@ -358,7 +356,7 @@ cdef class Vocab: keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) - self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name) + self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row]) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) syn_keys = ops.to_numpy(syn_keys) remap = {} diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index 54b5065e8..ee41144f6 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a | `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.EmptyKB.v1 {id="EmptyKB"} +### spacy.EmptyKB.v1 {id="EmptyKB.v1"} A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) -instance. This is the default when a new entity linker component is created. +instance. | Name | Description | | ---------------------- | ----------------------------------------------------------------------------------- | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | +### spacy.EmptyKB.v2 {id="EmptyKB"} + +A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) +instance. This is the default when a new entity linker component is created. It +returns a `Callable[[Vocab, int], InMemoryLookupKB]`. + ### spacy.KBFromFile.v1 {id="KBFromFile"} A function that reads an existing `KnowledgeBase` from file. @@ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default `CandidateGenerator` uses the text of a mention to find its potential aliases in the `KnowledgeBase`. Note that this function is case-dependent. +### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"} + +A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of +[`Span`](/api/span) objects denoting named entities, and returns a list of +plausible [`Candidate`](/api/kb/#candidate) objects per specified +[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a +mention to find its potential aliases in the `KnowledgeBase`. Note that this +function is case-dependent. 
+ ## Coreference {id="coref-architectures",tag="experimental"} A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 9777650a9..e38b53775 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -201,7 +201,7 @@ This functionality was previously available as part of the command `init-model`. ```bash -$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose] +$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose] ``` | Name | Description | @@ -212,7 +212,6 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | | `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ | -| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | A spaCy pipeline directory containing the vocab and vectors. | @@ -1492,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir] ### project pull {id="project-pull",tag="command"} Download all files or directories listed as `outputs` for commands, unless they -are not already present locally. When searching for files in the remote, `pull` +are already present locally. When searching for files in the remote, `pull` won't just look at the output path, but will also consider the **command string** and the **hashes of the dependencies**. For instance, let's say you've previously pushed a checkpoint to the remote, but now you've changed some diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index 390034a6c..14e0916d1 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. 
`A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index 1a3f6179f..fca056ed0 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -214,6 +214,7 @@ alignment mode `"strict". | `start` | The index of the first character of the span. ~~int~~ | | `end` | The index of the last character after the span. 
~~int~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| _keyword-only_ | | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | @@ -653,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer). ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"} -Iterate over the base noun phrases in the document. Yields base noun-phrase -`Span` objects, if the document has been syntactically parsed. A base noun -phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be -nested within it – so no NP-level coordination, no prepositional phrases, and no -relative clauses. +Returns a tuple of the base noun phrases in the doc, if the document has been +syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that +does not permit other NPs to be nested within it – so no NP-level coordination, +no prepositional phrases, and no relative clauses. To customize the noun chunk iterator in a loaded pipeline, modify [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` @@ -674,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised. > assert chunks[1].text == "another phrase" > ``` -| Name | Description | -| ---------- | ------------------------------------- | -| **YIELDS** | Noun chunks in the document. ~~Span~~ | +| Name | Description | +| ----------- | -------------------------------------------- | +| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ | ## Doc.sents {id="sents",tag="property",model="sentences"} -Iterate over the sentences in the document. Sentence spans have no label. +Returns a tuple of the sentences in the document. Sentence spans have no label. This property is only available when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the @@ -696,9 +696,9 @@ will raise an error otherwise. > assert [s.root.text for s in sents] == ["is", "'s"] > ``` -| Name | Description | -| ---------- | ----------------------------------- | -| **YIELDS** | Sentences in the document. ~~Span~~ | +| Name | Description | +| ----------- | ------------------------------------------ | +| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ | ## Doc.has_vector {id="has_vector",tag="property",model="vectors"} diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 12b2f6bef..3af7ac4dd 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -53,20 +53,22 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. 
~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| Setting | Description | +| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | +| `generate_empty_kb` 3.6 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). 
~~Callable[[Vocab, int], KnowledgeBase]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index e62d9c724..e1ada3b45 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -188,12 +188,12 @@ the character indices don't map to a valid span. | Name | Description | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | +| `start_idx` | The index of the first character of the span. ~~int~~ | +| `end_idx` | The index of the last character after the span. ~~int~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| _keyword-only_ | | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| `id` | Unused. ~~Union[int, str]~~ | | `alignment_mode` 3.5.1 | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | `span_id` 3.5.1 | An identifier to associate with the span. ~~Union[int, str]~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | @@ -275,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of > assert ents[0].text == "Mr. Best" > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------------- | -| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------ | +| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ | ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"} -Iterate over the base noun phrases in the span. Yields base noun-phrase `Span` -objects, if the document has been syntactically parsed. 
A base noun phrase, or -"NP chunk", is a noun phrase that does not permit other NPs to be nested within -it – so no NP-level coordination, no prepositional phrases, and no relative -clauses. +Returns a tuple of the base noun phrases in the span if the document has been +syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that +does not permit other NPs to be nested within it – so no NP-level coordination, +no prepositional phrases, and no relative clauses. If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) has not been implemeted for the given language, a `NotImplementedError` is @@ -301,9 +300,9 @@ raised. > assert chunks[0].text == "another phrase" > ``` -| Name | Description | -| ---------- | --------------------------------- | -| **YIELDS** | Noun chunks in the span. ~~Span~~ | +| Name | Description | +| ----------- | ---------------------------------------- | +| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ | ## Span.as_doc {id="as_doc",tag="method"} @@ -525,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)] ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"} -Returns a generator over the sentences the span belongs to. This property is -only available when [sentence boundaries](/usage/linguistic-features#sbd) have -been set on the document by the `parser`, `senter`, `sentencizer` or some custom +Returns a tuple of the sentences the span belongs to. This property is only +available when [sentence boundaries](/usage/linguistic-features#sbd) have been +set on the document by the `parser`, `senter`, `sentencizer` or some custom function. It will raise an error otherwise. If the span happens to cross sentence boundaries, all sentences the span @@ -541,9 +540,9 @@ overlaps with will be returned. > assert len(span.sents) == 2 > ``` -| Name | Description | -| ----------- | -------------------------------------------------------------------------- | -| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------- | +| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ | ## Attributes {id="attributes"} diff --git a/website/docs/api/tok2vec.mdx b/website/docs/api/tok2vec.mdx index a1bb1265e..8b6d2380b 100644 --- a/website/docs/api/tok2vec.mdx +++ b/website/docs/api/tok2vec.mdx @@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ | +## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"} + +Performs an update of the student pipe's model using the student's distillation +examples and sets the annotations of the teacher's distillation examples using +the teacher pipe. + +Unlike other trainable pipes, the student pipe doesn't directly learn its +representations from the teacher. However, since downstream pipes that do +perform distillation expect the tok2vec annotations to be present on the +correct distillation examples, we need to ensure that they are set beforehand. + +The distillation is performed on ~~Example~~ objects. The `Example.reference` +and `Example.predicted` ~~Doc~~s must have the same number of tokens and the +same orthography. Even though the reference does not need have to have gold +annotations, the teacher could adds its own annotations when necessary. + +This feature is experimental. 
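The requirement that `Example.reference` and `Example.predicted` share the same tokens can be illustrated with a short sketch; the blank pipelines below are placeholders and the snippet is not taken from this diff:

```python
# Minimal sketch: distillation examples whose reference (teacher) and
# predicted (student) docs contain exactly the same tokens.
# The blank pipelines stand in for real teacher/student pipelines.
import spacy
from spacy.tokens import Doc
from spacy.training import Example

teacher = spacy.blank("en")  # placeholder for a trained teacher pipeline
student = spacy.blank("en")  # placeholder for the student pipeline

words = ["The", "teacher", "annotates", "raw", "text", "."]
spaces = [True, True, True, True, False, False]
examples = [
    Example(
        Doc(student.vocab, words=words, spaces=spaces),  # predicted
        Doc(teacher.vocab, words=words, spaces=spaces),  # reference
    )
]
```

The reference docs don't need gold annotations; the teacher pipe can add its own annotations during distillation, as described above.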
+ +> #### Example +> +> ```python +> teacher_pipe = teacher.add_pipe("tok2vec") +> student_pipe = student.add_pipe("tok2vec") +> optimizer = nlp.resume_training() +> losses = student.distill(teacher_pipe, examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~ | +| `examples` | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | Dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## Tok2Vec.pipe {id="pipe",tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index b13a6d28b..921b7a151 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Description | -| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | -| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | -| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | -| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | -| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | -| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | -| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | -| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | -| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | -| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | -| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. 
~~int~~ | +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | +| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | +| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | +| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | +| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | +| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | +| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | +| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | +| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | #### Named Entity Visualizer options {id="displacy_options-ent"} diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx index d6033c096..021484a1b 100644 --- a/website/docs/api/vectors.mdx +++ b/website/docs/api/vectors.mdx @@ -52,7 +52,6 @@ modified later. | `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | | `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ | | `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | -| `name` | A name to identify the vectors table. ~~str~~ | | `mode` 3.2 | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ | | `minn` 3.2 | The floret char ngram minn (default: `0`). ~~int~~ | | `maxn` 3.2 | The floret char ngram maxn (default: `0`). ~~int~~ | diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 131e4ce0a..3faf1f1a0 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -27,7 +27,6 @@ Create the vocabulary. | `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. 
Defaults to `None`. ~~Optional[Lookups]~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | -| `vectors_name` | A name to identify the vectors table. ~~str~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | | `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ | diff --git a/website/docs/usage/101/_vectors-similarity.mdx b/website/docs/usage/101/_vectors-similarity.mdx index 6deab926d..39ee8e48a 100644 --- a/website/docs/usage/101/_vectors-similarity.mdx +++ b/website/docs/usage/101/_vectors-similarity.mdx @@ -22,17 +22,20 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, To make them compact and fast, spaCy's small [pipeline packages](/models) (all -packages that end in `sm`) **don't ship with word vectors**, and only include -context-sensitive **tensors**. This means you can still use the `similarity()` -methods to compare documents, spans and tokens – but the result won't be as -good, and individual tokens won't have any vectors assigned. So in order to use -_real_ word vectors, you need to download a larger pipeline package: +packages that end in `sm`) **don't ship with word vectors**. In order to use +`similarity()`, you need to download a larger pipeline package that includes +vectors: ```diff - python -m spacy download en_core_web_sm -+ python -m spacy download en_core_web_lg ++ python -m spacy download en_core_web_md ``` +In spaCy v3 and earlier, small pipeline packages supported `similarity()` by +backing off to context-sensitive tensors from the `tok2vec` component. These +tensors do not work well for this purpose and this backoff has been removed in +spaCy v4. + Pipeline packages that come with built-in word vectors make them available as diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 0c2bd7a66..792ec119a 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. 
`A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} @@ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline") The saved pipeline now includes the `"entity_ruler"` in its [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a -file `entityruler.jsonl` with the patterns. When you load the pipeline back in, -all pipeline components will be restored and deserialized – including the entity +file `patterns.jsonl` with the patterns. When you load the pipeline back in, all +pipeline components will be restored and deserialized – including the entity ruler. This lets you ship powerful pipeline packages with binary weights _and_ rules included! diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx index 1d3682af4..c372744de 100644 --- a/website/docs/usage/visualizers.mdx +++ b/website/docs/usage/visualizers.mdx @@ -58,12 +58,12 @@ arcs. 
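To make the Semgrex-style operators documented above concrete, here is a minimal sketch of a `DependencyMatcher` pattern that uses the `>+` (right immediate child) operator. It assumes spaCy 3.5.1 or later and that the `en_core_web_sm` pipeline is installed; the rule name and example sentence are purely illustrative.

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")  # any pipeline with a parser works
matcher = DependencyMatcher(nlp.vocab)

# Anchor on a verb, then require a direct object that is its right
# immediate child, i.e. the token directly after the verb (">+").
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    {
        "LEFT_ID": "verb",
        "REL_OP": ">+",
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": "dobj"},
    },
]
matcher.add("VERB_OBJECT", [pattern])

doc = nlp("She ate pizza.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # e.g. ['ate', 'pizza']
```

Swapping `>+` for `>++` would relax the constraint so the object may appear anywhere to the right of the verb rather than immediately after it.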
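For the entity ruler serialization behaviour described above, a minimal sketch of saving and reloading a pipeline with an entity ruler; the pattern, labels and output path are illustrative only.

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Explosion"}])

# Saving the pipeline serializes the ruler's patterns to a patterns.jsonl
# file inside the pipeline directory.
nlp.to_disk("/tmp/ruler_pipeline")

# Loading restores all components, including the entity ruler and its patterns.
nlp2 = spacy.load("/tmp/ruler_pipeline")
doc = nlp2("Explosion develops spaCy.")
print([(ent.text, ent.label_) for ent in doc.ents])  # [('Explosion', 'ORG')]
```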
-| Argument  | Description                                                                                 |
-| --------- | ------------------------------------------------------------------------------------------- |
-| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~    |
-| `color`   | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~                       |
-| `bg`      | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~                 |
-| `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~                        |
+| Argument  | Description |
+| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `compact` | "Compact mode" with square arrows, which takes up less space. Defaults to `False`. ~~bool~~ |
+| `color`   | Text color. Can be provided as a string in any legal CSS format, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"`, which all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
+| `bg`      | Background color. Can be provided as a string in any legal CSS format, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"`, which all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
+| `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
 
 For a list of all available options, see the
 [`displacy` API documentation](/api/top-level#displacy_options).
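As a quick illustration of the dependency visualizer options documented above, the sketch below passes a handful of them to `displacy.render`, including colors in different legal CSS formats. It assumes the `en_core_web_sm` pipeline is installed; the example sentence and output path are arbitrary.

```python
from pathlib import Path

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

options = {
    "compact": True,            # square arrows, less horizontal space
    "collapse_punct": True,     # attach punctuation to the preceding token
    "color": "rgb(0, 100, 0)",  # any legal CSS color format works
    "bg": "#fafafa",
    "font": "Arial",
    "distance": 120,            # px between words
}

# Outside Jupyter, render() returns the SVG markup as a string.
svg = displacy.render(doc, style="dep", options=options)
Path("dep.svg").write_text(svg, encoding="utf-8")
```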
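Since small (`sm`) packages no longer back off to `tok2vec` tensors, `similarity()` needs a package that ships with real word vectors. A minimal sketch, assuming `en_core_web_md` has already been downloaded via `python -m spacy download en_core_web_md`:

```python
import spacy

nlp = spacy.load("en_core_web_md")  # medium package with word vectors

doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

print(doc1.similarity(doc2))        # similarity of the two docs
print(doc1[2], doc1[2].has_vector)  # individual tokens have vectors too
```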