diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 70882c3cc..555322782 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -16,7 +16,7 @@ jobs: with: ref: ${{ github.head_ref }} - uses: actions/setup-python@v4 - - run: pip install black + - run: pip install black -c requirements.txt - name: Auto-format code if needed run: black spacy # We can't run black --check here because that returns a non-zero excit diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1a7c0c9a4..3c0b27c1d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its Python modules. If you've built spaCy from source, you'll already have both tools installed. +As a general rule of thumb, we use f-strings for any formatting of strings. +One exception are calls to Python's `logging` functionality. +To avoid unnecessary string conversions in these cases, we use string formatting +templates with `%s` and `%d` etc. + **⚠️ Note that formatting and linting is currently only possible for Python modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a6a575315..9b7ebbe01 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -41,7 +41,7 @@ jobs: inputs: versionSpec: "3.8" - script: | - pip install black==22.3.0 + pip install black -c requirements.txt python -m black spacy --check displayName: "black" - script: | diff --git a/requirements.txt b/requirements.txt index 78cccfbf1..6f4b61918 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<6.0.0 hypothesis>=3.27.0,<7.0.0 -mypy>=0.990,<0.1000; platform_machine != "aarch64" +mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7" +types-dataclasses>=0.1.3; python_version < "3.7" types-mock>=0.1.1 types-setuptools>=57.0.0 types-requests types-setuptools>=57.0.0 -black>=22.0,<23.0 +black==22.3.0 diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 42883f896..d763fba1f 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -90,9 +90,9 @@ def parse_config_overrides( cli_overrides = _parse_overrides(args, is_cli=True) if cli_overrides: keys = [k for k in cli_overrides if k not in env_overrides] - logger.debug(f"Config overrides from CLI: {keys}") + logger.debug("Config overrides from CLI: %s", keys) if env_overrides: - logger.debug(f"Config overrides from env variables: {list(env_overrides)}") + logger.debug("Config overrides from env variables: %s", list(env_overrides)) return {**cli_overrides, **env_overrides} diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 324c5d1bb..6351f28eb 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -252,7 +252,7 @@ def get_third_party_dependencies( raise regerr from None module_name = func_info.get("module") # type: ignore[attr-defined] if module_name: # the code is part of a module, not a --code file - modules.add(func_info["module"].split(".")[0]) # type: ignore[index] + modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr] dependencies = [] for module_name in modules: if module_name in distributions: diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index 6e3cde88c..8894baa50 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): # in the list. while commands: for i, cmd in enumerate(list(commands)): - logger.debug(f"CMD: {cmd['name']}.") + logger.debug("CMD: %s.", cmd["name"]) deps = [project_dir / dep for dep in cmd.get("deps", [])] if all(dep.exists() for dep in deps): cmd_hash = get_command_hash("", "", deps, cmd["script"]) for output_path in cmd.get("outputs", []): url = storage.pull(output_path, command_hash=cmd_hash) logger.debug( - f"URL: {url} for {output_path} with command hash {cmd_hash}" + "URL: %s for %s with command hash %s", + url, + output_path, + cmd_hash, ) yield url, output_path @@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): commands.pop(i) break else: - logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.") + logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"]) else: # If we didn't break the for loop, break the while loop. break diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py index bc779e9cd..a8178de21 100644 --- a/spacy/cli/project/push.py +++ b/spacy/cli/project/push.py @@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str): remote = config["remotes"][remote] storage = RemoteStorage(project_dir, remote) for cmd in config.get("commands", []): - logger.debug(f"CMD: cmd['name']") + logger.debug("CMD: %s", cmd["name"]) deps = [project_dir / dep for dep in cmd.get("deps", [])] if any(not dep.exists() for dep in deps): - logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs") + logger.debug("Dependency missing. Skipping %s outputs", cmd["name"]) continue cmd_hash = get_command_hash( "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] ) - logger.debug(f"CMD_HASH: {cmd_hash}") + logger.debug("CMD_HASH: %s", cmd_hash) for output_path in cmd.get("outputs", []): output_loc = project_dir / output_path if output_loc.exists() and _is_not_empty_dir(output_loc): @@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str): content_hash=get_content_hash(output_loc), ) logger.debug( - f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}" + "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash ) yield output_path, url diff --git a/spacy/errors.py b/spacy/errors.py index eadbf63d6..5049100d8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -437,8 +437,7 @@ class Errors(metaclass=ErrorsWithCodes): E133 = ("The sum of prior probabilities for alias '{alias}' should not " "exceed 1, but found {sum}.") E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") - E139 = ("Knowledge base for component '{name}' is empty. Use the methods " - "`kb.add_entity` and `kb.add_alias` to add entries.") + E139 = ("Knowledge base for component '{name}' is empty.") E140 = ("The list of entities, prior probabilities and entity vectors " "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " @@ -951,7 +950,7 @@ class Errors(metaclass=ErrorsWithCodes): E1049 = ("No available port found for displaCy on host {host}. Please specify an available port " "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " - "or use `auto_switch_port=True` to pick an available port automatically.") + "or use `auto_select_port=True` to pick an available port automatically.") # v4 error strings E4000 = ("Expected a Doc as input, but got: '{type}'") diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index edba523cf..2a74d047b 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): self._alias_index = PreshMap(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1) + def is_empty(self): + return len(self) == 0 + def __len__(self): return self.get_size_entities() diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 6963e8b79..28e5085a8 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer - - -# Punctuation stolen from Danish -from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES class SwedishDefaults(BaseDefaults): diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py new file mode 100644 index 000000000..67f1bcdc4 --- /dev/null +++ b/spacy/lang/sv/punctuation.py @@ -0,0 +1,33 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import TOKENIZER_SUFFIXES + + +_quotes = CONCAT_QUOTES.replace("'", "") + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER), + ] +) + +_suffixes = [ + suffix + for suffix in TOKENIZER_SUFFIXES + if suffix not in ["'s", "'S", "’s", "’S", r"\'"] +] +_suffixes += [r"(?<=[^sSxXzZ])\'"] + + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/language.py b/spacy/language.py index 13a3d101a..3b86fdde7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -106,7 +106,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: @registry.misc("spacy.LookupsDataLoader.v1") def load_lookups_data(lang, tables): - util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") + util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables) lookups = load_lookups(lang=lang, tables=tables) return lookups @@ -2072,7 +2072,7 @@ class Language: pipe = self.get_pipe(pipe_name) pipe_cfg = self._pipe_configs[pipe_name] if listeners: - util.logger.debug(f"Replacing listeners of component '{pipe_name}'") + util.logger.debug("Replacing listeners of component '%s'", pipe_name) if len(list(listeners)) != len(pipe_listeners): # The number of listeners defined in the component model doesn't # match the listeners to replace, so we won't be able to update diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 4c6004907..e2a1b8a3b 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -82,8 +82,12 @@ cdef class DependencyMatcher: "$-": self._imm_left_sib, "$++": self._right_sib, "$--": self._left_sib, + ">+": self._imm_right_child, + ">-": self._imm_left_child, ">++": self._right_child, ">--": self._left_child, + "<+": self._imm_right_parent, + "<-": self._imm_left_parent, "<++": self._right_parent, "<--": self._left_parent, } @@ -427,12 +431,34 @@ cdef class DependencyMatcher: def _left_sib(self, doc, node): return [doc[child.i] for child in doc[node].head.children if child.i < node] + def _imm_right_child(self, doc, node): + for child in doc[node].children: + if child.i == node + 1: + return [doc[child.i]] + return [] + + def _imm_left_child(self, doc, node): + for child in doc[node].children: + if child.i == node - 1: + return [doc[child.i]] + return [] + def _right_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i > node] def _left_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i < node] + def _imm_right_parent(self, doc, node): + if doc[node].head.i == node + 1: + return [doc[node].head] + return [] + + def _imm_left_parent(self, doc, node): + if doc[node].head.i == node - 1: + return [doc[node].head] + return [] + def _right_parent(self, doc, node): if doc[node].head.i > node: return [doc[node].head] diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 498689a7c..17bdfd394 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -829,6 +829,11 @@ def _get_attr_values(spec, string_store): return attr_values +def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None): + # tuple order affects performance + return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True)) + + # These predicate helper classes are used to match the REGEX, IN, >= etc # extensions to the matcher introduced in #3173. @@ -848,7 +853,7 @@ class _FuzzyPredicate: fuzz = self.predicate[len("FUZZY"):] # number after prefix self.fuzzy = int(fuzz) if fuzz else -1 self.fuzzy_compare = fuzzy_compare - self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy) def __call__(self, Token token): if self.is_extension: @@ -870,7 +875,7 @@ class _RegexPredicate: self.value = re.compile(value) self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -906,7 +911,7 @@ class _SetPredicate: self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -978,7 +983,7 @@ class _ComparisonPredicate: self.value = value self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types, if isinstance(value, dict): for type_, cls in predicate_types.items(): if type_ in value: - key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True)) + key = _predicate_cache_key(attr, type_, value[type_]) if key in seen_predicates: output.append(seen_predicates[key]) else: diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 299b6bb52..7332ca199 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -89,6 +89,14 @@ def load_kb( return kb_from_file +@registry.misc("spacy.EmptyKB.v2") +def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length) + + return empty_kb_factory + + @registry.misc("spacy.EmptyKB.v1") def empty_kb( entity_vector_length: int, diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 63d5cccc2..cd13a4b21 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -58,6 +58,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, "overwrite": False, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, "candidates_batch_size": 1, @@ -84,6 +85,7 @@ def make_entity_linker( get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, scorer: Optional[Callable], use_gold_ents: bool, @@ -106,6 +108,7 @@ def make_entity_linker( get_candidates_batch ( Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. @@ -147,6 +150,7 @@ def make_entity_linker( entity_vector_length=entity_vector_length, get_candidates=get_candidates, get_candidates_batch=get_candidates_batch, + generate_empty_kb=generate_empty_kb, overwrite=overwrite, scorer=scorer, use_gold_ents=use_gold_ents, @@ -188,6 +192,7 @@ class EntityLinker(TrainablePipe): get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool = False, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, @@ -212,6 +217,7 @@ class EntityLinker(TrainablePipe): Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. overwrite (bool): Whether to overwrite existing non-empty annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another @@ -219,6 +225,7 @@ class EntityLinker(TrainablePipe): candidates_batch_size (int): Size of batches for entity candidate generation. threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. + save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/entitylinker#init """ @@ -235,6 +242,7 @@ class EntityLinker(TrainablePipe): self.model = model self.name = name self.labels_discard = list(labels_discard) + # how many neighbour sentences to take into account self.n_sents = n_sents self.incl_prior = incl_prior self.incl_context = incl_context @@ -242,9 +250,7 @@ class EntityLinker(TrainablePipe): self.get_candidates_batch = get_candidates_batch self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) - # how many neighbour sentences to take into account - # create an empty KB by default - self.kb = empty_kb(entity_vector_length)(self.vocab) + self.kb = generate_empty_kb(self.vocab, entity_vector_length) self.scorer = scorer self.use_gold_ents = use_gold_ents self.candidates_batch_size = candidates_batch_size @@ -266,7 +272,7 @@ class EntityLinker(TrainablePipe): # Raise an error if the knowledge base is not initialized. if self.kb is None: raise ValueError(Errors.E1018.format(name=self.name)) - if len(self.kb) == 0: + if hasattr(self.kb, "is_empty") and self.kb.is_empty(): raise ValueError(Errors.E139.format(name=self.name)) def initialize( diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py index bbb0ff415..0aa495992 100644 --- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py @@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text): def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 3 + + +@pytest.mark.issue(12311) +@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"]) +def test_sv_tokenizer_handles_colon(sv_tokenizer, text): + tokens = sv_tokenizer(text) + assert len(tokens) == 1 diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index b4e19d69d..200384320 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches): ("the", "brown", "$--", 0), ("brown", "the", "$--", 1), ("brown", "brown", "$--", 0), + ("over", "jumped", "<+", 0), + ("quick", "fox", "<+", 0), + ("the", "quick", "<+", 0), + ("brown", "fox", "<+", 1), ("quick", "fox", "<++", 1), ("quick", "over", "<++", 0), ("over", "jumped", "<++", 0), ("the", "fox", "<++", 2), + ("brown", "fox", "<-", 0), + ("fox", "over", "<-", 0), + ("the", "over", "<-", 0), + ("over", "jumped", "<-", 1), ("brown", "fox", "<--", 0), ("fox", "jumped", "<--", 0), ("fox", "over", "<--", 1), + ("fox", "brown", ">+", 0), + ("over", "fox", ">+", 0), + ("over", "the", ">+", 0), + ("jumped", "over", ">+", 1), ("jumped", "over", ">++", 1), ("fox", "lazy", ">++", 0), ("over", "the", ">++", 0), + ("jumped", "over", ">-", 0), + ("fox", "quick", ">-", 0), + ("brown", "quick", ">-", 0), + ("fox", "brown", ">-", 1), ("brown", "fox", ">--", 0), ("fox", "brown", ">--", 1), ("jumped", "fox", ">--", 1), diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 506530591..ed84ce674 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -353,6 +353,9 @@ def test_kb_default(nlp): """Test that the default (empty) KB is loaded upon construction""" entity_linker = nlp.add_pipe("entity_linker", config={}) assert len(entity_linker.kb) == 0 + with pytest.raises(ValueError, match="E139"): + # this raises an error because the KB is empty + entity_linker.validate_kb() assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_aliases() == 0 # 64 is the default value from pipeline.entity_linker diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 8d3653ab1..f9d2e226b 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,7 +1,10 @@ -from typing import Callable +from pathlib import Path +from typing import Callable, Iterable, Any, Dict -from spacy import util -from spacy.util import ensure_path, registry, load_model_from_config +import srsly + +from spacy import util, Errors +from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList from spacy.kb.kb_in_memory import InMemoryLookupKB from spacy.vocab import Vocab from thinc.api import Config @@ -91,7 +94,10 @@ def test_serialize_subclassed_kb(): [components.entity_linker] factory = "entity_linker" - + + [components.entity_linker.generate_empty_kb] + @misc = "kb_test.CustomEmptyKB.v1" + [initialize] [initialize.components] @@ -99,7 +105,7 @@ def test_serialize_subclassed_kb(): [initialize.components.entity_linker] [initialize.components.entity_linker.kb_loader] - @misc = "spacy.CustomKB.v1" + @misc = "kb_test.CustomKB.v1" entity_vector_length = 342 custom_field = 666 """ @@ -109,10 +115,57 @@ def test_serialize_subclassed_kb(): super().__init__(vocab, entity_vector_length) self.custom_field = custom_field - @registry.misc("spacy.CustomKB.v1") + def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well.""" + path = ensure_path(path) + if not path.exists(): + path.mkdir(parents=True) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def serialize_custom_fields(file_path: Path) -> None: + srsly.write_json(file_path, {"custom_field": self.custom_field}) + + serialize = { + "contents": lambda p: self.write_contents(p), + "strings.json": lambda p: self.vocab.strings.to_disk(p), + "custom_fields": lambda p: serialize_custom_fields(p), + } + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well.""" + path = ensure_path(path) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def deserialize_custom_fields(file_path: Path) -> None: + self.custom_field = srsly.read_json(file_path)["custom_field"] + + deserialize: Dict[str, Callable[[Any], Any]] = { + "contents": lambda p: self.read_contents(p), + "strings.json": lambda p: self.vocab.strings.from_disk(p), + "custom_fields": lambda p: deserialize_custom_fields(p), + } + util.from_disk(path, deserialize, exclude) + + @registry.misc("kb_test.CustomEmptyKB.v1") + def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return SubInMemoryLookupKB( + vocab=vocab, + entity_vector_length=entity_vector_length, + custom_field=0, + ) + + return empty_kb_factory + + @registry.misc("kb_test.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int - ) -> Callable[[Vocab], InMemoryLookupKB]: + ) -> Callable[[Vocab], SubInMemoryLookupKB]: def custom_kb_factory(vocab): kb = SubInMemoryLookupKB( vocab=vocab, @@ -139,6 +192,6 @@ def test_serialize_subclassed_kb(): nlp2 = util.load_model_from_path(tmp_dir) entity_linker2 = nlp2.get_pipe("entity_linker") # After IO, the KB is the standard one - assert type(entity_linker2.kb) == InMemoryLookupKB + assert type(entity_linker2.kb) == SubInMemoryLookupKB assert entity_linker2.kb.entity_vector_length == 342 - assert not hasattr(entity_linker2.kb, "custom_field") + assert entity_linker2.kb.custom_field == 666 diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 40100412a..8aaadf686 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,5 +1,7 @@ import os from pathlib import Path +import pytest +import srsly from typer.testing import CliRunner from spacy.tokens import DocBin, Doc @@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab): # Instead of checking specific wording of the output, which may change, # we'll check that this section of the debug output is present. assert "= Trainable Lemmatizer =" in result_debug_data.stdout + + +# project tests + +SAMPLE_PROJECT = { + "title": "Sample project", + "description": "This is a project for testing", + "assets": [ + { + "dest": "assets/spacy-readme.md", + "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md", + "checksum": "411b2c89ccf34288fae8ed126bf652f7", + }, + { + "dest": "assets/citation.cff", + "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff", + "checksum": "c996bfd80202d480eb2e592369714e5e", + "extra": True, + }, + ], + "commands": [ + { + "name": "ok", + "help": "print ok", + "script": ["python -c \"print('okokok')\""], + }, + { + "name": "create", + "help": "make a file", + "script": ["touch abc.txt"], + "outputs": ["abc.txt"], + }, + { + "name": "clean", + "help": "remove test file", + "script": ["rm abc.txt"], + }, + ], +} + +SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT) + + +@pytest.fixture +def project_dir(): + with make_tempdir() as pdir: + (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT) + yield pdir + + +def test_project_document(project_dir): + readme_path = project_dir / "README.md" + assert not readme_path.exists(), "README already exists" + result = CliRunner().invoke( + app, ["project", "document", str(project_dir), "-o", str(readme_path)] + ) + assert result.exit_code == 0 + assert readme_path.is_file() + text = readme_path.read_text("utf-8") + assert SAMPLE_PROJECT["description"] in text + + +def test_project_assets(project_dir): + asset_dir = project_dir / "assets" + assert not asset_dir.exists(), "Assets dir is already present" + result = CliRunner().invoke(app, ["project", "assets", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded" + # check that extras work + result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded" + + +def test_project_run(project_dir): + # make sure dry run works + test_file = project_dir / "abc.txt" + result = CliRunner().invoke( + app, ["project", "run", "--dry", "create", str(project_dir)] + ) + assert result.exit_code == 0 + assert not test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)]) + assert result.exit_code == 0 + assert "okokok" in result.stdout + + +@pytest.mark.parametrize( + "options", + [ + "", + # "--sparse", + "--branch v3", + "--repo https://github.com/explosion/projects --branch v3", + ], +) +def test_project_clone(options): + with make_tempdir() as workspace: + out = workspace / "project" + target = "benchmarks/ner_conll03" + if not options: + options = [] + else: + options = options.split() + result = CliRunner().invoke( + app, ["project", "clone", target, *options, str(out)] + ) + assert result.exit_code == 0 + assert (out / "README.md").is_file() + + +def test_project_push_pull(project_dir): + proj = dict(SAMPLE_PROJECT) + remote = "xyz" + + with make_tempdir() as remote_dir: + proj["remotes"] = {remote: str(remote_dir)} + proj_text = srsly.yaml_dumps(proj) + (project_dir / "project.yml").write_text(proj_text) + + test_file = project_dir / "abc.txt" + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() + result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)]) + assert result.exit_code == 0 + result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)]) + assert result.exit_code == 0 + assert not test_file.exists() + result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 3d0905dd3..9b8c7b9c7 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -98,7 +98,7 @@ def assert_sents_error(doc): def warn_error(proc_name, proc, docs, e): logger = logging.getLogger("spacy") - logger.warning(f"Trouble with component {proc_name}.") + logger.warning("Trouble with component %s.", proc_name) @pytest.fixture diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py index 426fddf90..7e2494f5b 100644 --- a/spacy/training/callbacks.py +++ b/spacy/training/callbacks.py @@ -11,7 +11,7 @@ def create_copy_from_base_model( ) -> Callable[[Language], Language]: def copy_from_base_model(nlp): if tokenizer: - logger.info(f"Copying tokenizer from: {tokenizer}") + logger.info("Copying tokenizer from: %s", tokenizer) base_nlp = load_model(tokenizer) if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]: nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"])) @@ -23,7 +23,7 @@ def create_copy_from_base_model( ) ) if vocab: - logger.info(f"Copying vocab from: {vocab}") + logger.info("Copying vocab from: %s", vocab) # only reload if the vocab is from a different model if tokenizer != vocab: base_nlp = load_model(vocab) diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index d626ad0e0..086ad831c 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -29,7 +29,7 @@ def create_docbin_reader( ) -> Callable[["Language"], Iterable[Example]]: if path is None: raise ValueError(Errors.E913) - util.logger.debug(f"Loading corpus from path: {path}") + util.logger.debug("Loading corpus from path: %s", path) return Corpus( path, gold_preproc=gold_preproc, diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 408acdbee..c626cb813 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": frozen_components = T["frozen_components"] # Sourced components that require resume_training resume_components = [p for p in sourced if p not in frozen_components] - logger.info(f"Pipeline: {nlp.pipe_names}") + logger.info("Pipeline: %s", nlp.pipe_names) if resume_components: with nlp.select_pipes(enable=resume_components): - logger.info(f"Resuming training for: {resume_components}") + logger.info("Resuming training for: %s", resume_components) nlp.resume_training(sgd=optimizer) # Make sure that listeners are defined before initializing further nlp._link_components() @@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": if T["max_epochs"] == -1: sample_size = 100 logger.debug( - f"Due to streamed train corpus, using only first {sample_size} " - f"examples for initialization. If necessary, provide all labels " - f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" + "Due to streamed train corpus, using only first %s examples for initialization. " + "If necessary, provide all labels in [initialize]. " + "More info: https://spacy.io/api/cli#init_labels", + sample_size, ) nlp.initialize( lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer ) else: nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) - logger.info(f"Initialized pipeline components: {nlp.pipe_names}") + logger.info("Initialized pipeline components: %s", nlp.pipe_names) # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: for listener in getattr( @@ -109,7 +110,7 @@ def init_vocab( ) -> None: if lookups: nlp.vocab.lookups = lookups - logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") + logger.info("Added vocab lookups: %s", ", ".join(lookups.tables)) data_path = ensure_path(data) if data_path is not None: lex_attrs = srsly.read_jsonl(data_path) @@ -125,11 +126,11 @@ def init_vocab( else: oov_prob = DEFAULT_OOV_PROB nlp.vocab.cfg.update({"oov_prob": oov_prob}) - logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") + logger.info("Added %d lexical entries to the vocab", len(nlp.vocab)) logger.info("Created vocabulary") if vectors is not None: load_vectors_into_model(nlp, vectors) - logger.info(f"Added vectors: {vectors}") + logger.info("Added vectors: %s", vectors) # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) @@ -191,7 +192,7 @@ def init_tok2vec( if weights_data is not None: layer = get_tok2vec_ref(nlp, P) layer.from_bytes(weights_data) - logger.info(f"Loaded pretrained weights from {init_tok2vec}") + logger.info("Loaded pretrained weights from %s", init_tok2vec) return True return False @@ -215,13 +216,13 @@ def convert_vectors( nlp.vocab.deduplicate_vectors() else: if vectors_loc: - logger.info(f"Reading vectors from {vectors_loc}") + logger.info("Reading vectors from %s", vectors_loc) vectors_data, vector_keys, floret_settings = read_vectors( vectors_loc, truncate, mode=mode, ) - logger.info(f"Loaded vectors from {vectors_loc}") + logger.info("Loaded vectors from %s", vectors_loc) else: vectors_data, vector_keys = (None, None) if vector_keys is not None and mode != VectorsMode.floret: diff --git a/spacy/training/loop.py b/spacy/training/loop.py index fcc023a0d..c737d7c01 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None: if subdir.exists(): try: shutil.rmtree(str(subdir)) - logger.debug(f"Removed existing output directory: {subdir}") + logger.debug("Removed existing output directory: %s", subdir) except Exception as e: raise IOError(Errors.E901.format(path=path)) from e diff --git a/spacy/util.py b/spacy/util.py index 2ce2e5e0f..1ce869152 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -33,6 +33,7 @@ import inspect import pkgutil import logging import socket +import stat try: import cupy.random @@ -55,7 +56,7 @@ if TYPE_CHECKING: # fmt: off OOV_RANK = numpy.iinfo(numpy.uint64).max DEFAULT_OOV_PROB = -20 -LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] +LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config file. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. @@ -139,8 +140,17 @@ class registry(thinc.registry): return func @classmethod - def find(cls, registry_name: str, func_name: str) -> Callable: - """Get info about a registered function from the registry.""" + def find( + cls, registry_name: str, func_name: str + ) -> Dict[str, Optional[Union[str, int]]]: + """Find information about a registered function, including the + module and path to the file it's defined in, the line number and the + docstring, if available. + + registry_name (str): Name of the catalogue registry. + func_name (str): Name of the registered function. + RETURNS (Dict[str, Optional[Union[str, int]]]): The function info. + """ # We're overwriting this classmethod so we're able to provide more # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): @@ -1028,11 +1038,19 @@ def make_tempdir() -> Generator[Path, None, None]: YIELDS (Path): The path of the temp directory. """ + d = Path(tempfile.mkdtemp()) + yield d + + # On Windows, git clones use read-only files, which cause permission errors + # when being deleted. This forcibly fixes permissions. + def force_remove(rmfunc, path, ex): + os.chmod(path, stat.S_IWRITE) + rmfunc(path) + try: - with tempfile.TemporaryDirectory() as td: - yield Path(td) + shutil.rmtree(str(d), onerror=force_remove) except PermissionError as e: - warnings.warn(Warnings.W091.format(dir=td, msg=e)) + warnings.warn(Warnings.W091.format(dir=d, msg=e)) def is_cwd(path: Union[Path, str]) -> bool: diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index 54b5065e8..ee41144f6 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a | `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.EmptyKB.v1 {id="EmptyKB"} +### spacy.EmptyKB.v1 {id="EmptyKB.v1"} A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) -instance. This is the default when a new entity linker component is created. +instance. | Name | Description | | ---------------------- | ----------------------------------------------------------------------------------- | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | +### spacy.EmptyKB.v2 {id="EmptyKB"} + +A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) +instance. This is the default when a new entity linker component is created. It +returns a `Callable[[Vocab, int], InMemoryLookupKB]`. + ### spacy.KBFromFile.v1 {id="KBFromFile"} A function that reads an existing `KnowledgeBase` from file. @@ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default `CandidateGenerator` uses the text of a mention to find its potential aliases in the `KnowledgeBase`. Note that this function is case-dependent. +### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"} + +A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of +[`Span`](/api/span) objects denoting named entities, and returns a list of +plausible [`Candidate`](/api/kb/#candidate) objects per specified +[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a +mention to find its potential aliases in the `KnowledgeBase`. Note that this +function is case-dependent. + ## Coreference {id="coref-architectures",tag="experimental"} A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 868079e8c..1a3f15e48 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir] ### project pull {id="project-pull",tag="command"} Download all files or directories listed as `outputs` for commands, unless they -are not already present locally. When searching for files in the remote, `pull` +are already present locally. When searching for files in the remote, `pull` won't just look at the output path, but will also consider the **command string** and the **hashes of the dependencies**. For instance, let's say you've previously pushed a checkpoint to the remote, but now you've changed some diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index 390034a6c..14e0916d1 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 12b2f6bef..3af7ac4dd 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -53,20 +53,22 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| Setting | Description | +| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | +| `generate_empty_kb` 3.6 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index b13a6d28b..921b7a151 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Description | -| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | -| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | -| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | -| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | -| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | -| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | -| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | -| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | -| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | -| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | -| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | +| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | +| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | +| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | +| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | +| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | +| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | +| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | +| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | #### Named Entity Visualizer options {id="displacy_options-ent"} diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 0c2bd7a66..792ec119a 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} @@ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline") The saved pipeline now includes the `"entity_ruler"` in its [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a -file `entityruler.jsonl` with the patterns. When you load the pipeline back in, -all pipeline components will be restored and deserialized – including the entity +file `patterns.jsonl` with the patterns. When you load the pipeline back in, all +pipeline components will be restored and deserialized – including the entity ruler. This lets you ship powerful pipeline packages with binary weights _and_ rules included! diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx index 1d3682af4..c372744de 100644 --- a/website/docs/usage/visualizers.mdx +++ b/website/docs/usage/visualizers.mdx @@ -58,12 +58,12 @@ arcs. -| Argument | Description | -| --------- | ----------------------------------------------------------------------------------------- | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| Argument | Description | +| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | For a list of all available options, see the [`displacy` API documentation](/api/top-level#displacy_options).