From 79ef6cf0f9ca75468457c86d0d6fd0d8709a9308 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 2 Feb 2023 11:15:22 +0100 Subject: [PATCH 01/32] Have logging calls use string formatting types (#12215) * change logging call for spacy.LookupsDataLoader.v1 * substitutions in language and _util * various more substitutions * add string formatting guidelines to contribution guidelines --- CONTRIBUTING.md | 5 +++++ spacy/cli/_util.py | 4 ++-- spacy/cli/project/pull.py | 9 ++++++--- spacy/cli/project/push.py | 8 ++++---- spacy/language.py | 4 ++-- spacy/tests/test_language.py | 2 +- spacy/training/callbacks.py | 4 ++-- spacy/training/corpus.py | 2 +- spacy/training/initialize.py | 25 +++++++++++++------------ spacy/training/loop.py | 2 +- 10 files changed, 37 insertions(+), 28 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1f396bd71..f6f6dab59 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its Python modules. If you've built spaCy from source, you'll already have both tools installed. +As a general rule of thumb, we use f-strings for any formatting of strings. +One exception are calls to Python's `logging` functionality. +To avoid unnecessary string conversions in these cases, we use string formatting +templates with `%s` and `%d` etc. + **⚠️ Note that formatting and linting is currently only possible for Python modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index ba3892b1d..f104feff9 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -90,9 +90,9 @@ def parse_config_overrides( cli_overrides = _parse_overrides(args, is_cli=True) if cli_overrides: keys = [k for k in cli_overrides if k not in env_overrides] - logger.debug(f"Config overrides from CLI: {keys}") + logger.debug("Config overrides from CLI: %s", keys) if env_overrides: - logger.debug(f"Config overrides from env variables: {list(env_overrides)}") + logger.debug("Config overrides from env variables: %s", list(env_overrides)) return {**cli_overrides, **env_overrides} diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index 6e3cde88c..8894baa50 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): # in the list. while commands: for i, cmd in enumerate(list(commands)): - logger.debug(f"CMD: {cmd['name']}.") + logger.debug("CMD: %s.", cmd["name"]) deps = [project_dir / dep for dep in cmd.get("deps", [])] if all(dep.exists() for dep in deps): cmd_hash = get_command_hash("", "", deps, cmd["script"]) for output_path in cmd.get("outputs", []): url = storage.pull(output_path, command_hash=cmd_hash) logger.debug( - f"URL: {url} for {output_path} with command hash {cmd_hash}" + "URL: %s for %s with command hash %s", + url, + output_path, + cmd_hash, ) yield url, output_path @@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): commands.pop(i) break else: - logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.") + logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"]) else: # If we didn't break the for loop, break the while loop. 
break diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py index bc779e9cd..a8178de21 100644 --- a/spacy/cli/project/push.py +++ b/spacy/cli/project/push.py @@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str): remote = config["remotes"][remote] storage = RemoteStorage(project_dir, remote) for cmd in config.get("commands", []): - logger.debug(f"CMD: cmd['name']") + logger.debug("CMD: %s", cmd["name"]) deps = [project_dir / dep for dep in cmd.get("deps", [])] if any(not dep.exists() for dep in deps): - logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs") + logger.debug("Dependency missing. Skipping %s outputs", cmd["name"]) continue cmd_hash = get_command_hash( "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] ) - logger.debug(f"CMD_HASH: {cmd_hash}") + logger.debug("CMD_HASH: %s", cmd_hash) for output_path in cmd.get("outputs", []): output_loc = project_dir / output_path if output_loc.exists() and _is_not_empty_dir(output_loc): @@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str): content_hash=get_content_hash(output_loc), ) logger.debug( - f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}" + "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash ) yield output_path, url diff --git a/spacy/language.py b/spacy/language.py index e0abfd5e7..9fdcf6328 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -104,7 +104,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: @registry.misc("spacy.LookupsDataLoader.v1") def load_lookups_data(lang, tables): - util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") + util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables) lookups = load_lookups(lang=lang, tables=tables) return lookups @@ -1969,7 +1969,7 @@ class Language: pipe = self.get_pipe(pipe_name) pipe_cfg = self._pipe_configs[pipe_name] if listeners: - util.logger.debug(f"Replacing listeners of component '{pipe_name}'") + util.logger.debug("Replacing listeners of component '%s'", pipe_name) if len(list(listeners)) != len(pipe_listeners): # The number of listeners defined in the component model doesn't # match the listeners to replace, so we won't be able to update diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 03790eb86..236856dad 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -46,7 +46,7 @@ def assert_sents_error(doc): def warn_error(proc_name, proc, docs, e): logger = logging.getLogger("spacy") - logger.warning(f"Trouble with component {proc_name}.") + logger.warning("Trouble with component %s.", proc_name) @pytest.fixture diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py index 426fddf90..7e2494f5b 100644 --- a/spacy/training/callbacks.py +++ b/spacy/training/callbacks.py @@ -11,7 +11,7 @@ def create_copy_from_base_model( ) -> Callable[[Language], Language]: def copy_from_base_model(nlp): if tokenizer: - logger.info(f"Copying tokenizer from: {tokenizer}") + logger.info("Copying tokenizer from: %s", tokenizer) base_nlp = load_model(tokenizer) if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]: nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"])) @@ -23,7 +23,7 @@ def create_copy_from_base_model( ) ) if vocab: - logger.info(f"Copying vocab from: {vocab}") + logger.info("Copying vocab from: %s", vocab) # only reload if the vocab is from a different model if tokenizer != vocab: base_nlp = load_model(vocab) diff 
--git a/spacy/training/corpus.py b/spacy/training/corpus.py index d626ad0e0..086ad831c 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -29,7 +29,7 @@ def create_docbin_reader( ) -> Callable[["Language"], Iterable[Example]]: if path is None: raise ValueError(Errors.E913) - util.logger.debug(f"Loading corpus from path: {path}") + util.logger.debug("Loading corpus from path: %s", path) return Corpus( path, gold_preproc=gold_preproc, diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 6304e4a84..e90617852 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": frozen_components = T["frozen_components"] # Sourced components that require resume_training resume_components = [p for p in sourced if p not in frozen_components] - logger.info(f"Pipeline: {nlp.pipe_names}") + logger.info("Pipeline: %s", nlp.pipe_names) if resume_components: with nlp.select_pipes(enable=resume_components): - logger.info(f"Resuming training for: {resume_components}") + logger.info("Resuming training for: %s", resume_components) nlp.resume_training(sgd=optimizer) # Make sure that listeners are defined before initializing further nlp._link_components() @@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": if T["max_epochs"] == -1: sample_size = 100 logger.debug( - f"Due to streamed train corpus, using only first {sample_size} " - f"examples for initialization. If necessary, provide all labels " - f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" + "Due to streamed train corpus, using only first %s examples for initialization. " + "If necessary, provide all labels in [initialize]. 
" + "More info: https://spacy.io/api/cli#init_labels", + sample_size, ) nlp.initialize( lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer ) else: nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) - logger.info(f"Initialized pipeline components: {nlp.pipe_names}") + logger.info("Initialized pipeline components: %s", nlp.pipe_names) # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: for listener in getattr( @@ -109,7 +110,7 @@ def init_vocab( ) -> None: if lookups: nlp.vocab.lookups = lookups - logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") + logger.info("Added vocab lookups: %s", ", ".join(lookups.tables)) data_path = ensure_path(data) if data_path is not None: lex_attrs = srsly.read_jsonl(data_path) @@ -125,11 +126,11 @@ def init_vocab( else: oov_prob = DEFAULT_OOV_PROB nlp.vocab.cfg.update({"oov_prob": oov_prob}) - logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") + logger.info("Added %d lexical entries to the vocab", len(nlp.vocab)) logger.info("Created vocabulary") if vectors is not None: load_vectors_into_model(nlp, vectors) - logger.info(f"Added vectors: {vectors}") + logger.info("Added vectors: %s", vectors) # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) @@ -191,7 +192,7 @@ def init_tok2vec( if weights_data is not None: layer = get_tok2vec_ref(nlp, P) layer.from_bytes(weights_data) - logger.info(f"Loaded pretrained weights from {init_tok2vec}") + logger.info("Loaded pretrained weights from %s", init_tok2vec) return True return False @@ -216,13 +217,13 @@ def convert_vectors( nlp.vocab.deduplicate_vectors() else: if vectors_loc: - logger.info(f"Reading vectors from {vectors_loc}") + logger.info("Reading vectors from %s", vectors_loc) vectors_data, vector_keys, floret_settings = read_vectors( vectors_loc, truncate, mode=mode, ) - logger.info(f"Loaded vectors from {vectors_loc}") + logger.info("Loaded vectors from %s", vectors_loc) else: vectors_data, vector_keys = (None, None) if vector_keys is not None and mode != VectorsMode.floret: diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 885257772..eca40e3d9 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -370,6 +370,6 @@ def clean_output_dir(path: Optional[Path]) -> None: if subdir.exists(): try: shutil.rmtree(str(subdir)) - logger.debug(f"Removed existing output directory: {subdir}") + logger.debug("Removed existing output directory: %s", subdir) except Exception as e: raise IOError(Errors.E901.format(path=path)) from e From 9a454676f3ccb0e2ecd53aa82e4108b84d5f3bb4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 3 Feb 2023 11:44:10 +0100 Subject: [PATCH 02/32] Use black version constraints from requirements.txt (#12220) --- .github/workflows/autoblack.yml | 2 +- azure-pipelines.yml | 2 +- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 70882c3cc..555322782 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -16,7 +16,7 @@ jobs: with: ref: ${{ github.head_ref }} - uses: actions/setup-python@v4 - - run: pip install black + - run: pip install black -c requirements.txt - name: Auto-format code if needed run: black spacy # We can't run black --check here because that returns a non-zero excit diff --git 
a/azure-pipelines.yml b/azure-pipelines.yml index 541656c3d..dba11bd1a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -41,7 +41,7 @@ jobs: inputs: versionSpec: "3.7" - script: | - pip install black==22.3.0 + pip install black -c requirements.txt python -m black spacy --check displayName: "black" - script: | diff --git a/requirements.txt b/requirements.txt index 1bd4518af..d6b0bc0dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,4 @@ types-mock>=0.1.1 types-setuptools>=57.0.0 types-requests types-setuptools>=57.0.0 -black>=22.0,<23.0 +black==22.3.0 From d38a88f0f3ca97776387780c2b79711d4971b09f Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 8 Feb 2023 14:18:33 +0100 Subject: [PATCH 03/32] Remove negation. (#12252) --- website/docs/api/cli.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index bd966015e..3f31bef95 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1492,7 +1492,7 @@ $ python -m spacy project push [remote] [project_dir] ### project pull {id="project-pull",tag="command"} Download all files or directories listed as `outputs` for commands, unless they -are not already present locally. When searching for files in the remote, `pull` +are already present locally. When searching for files in the remote, `pull` won't just look at the output path, but will also consider the **command string** and the **hashes of the dependencies**. For instance, let's say you've previously pushed a checkpoint to the remote, but now you've changed some From 9d920bafcf4c03c6015deb67d6b0c335b8b04986 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Feb 2023 14:33:16 +0100 Subject: [PATCH 04/32] Extend mypy to v1.0.x (#12245) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d6b0bc0dd..bc9fc183c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,7 +31,7 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<6.0.0 hypothesis>=3.27.0,<7.0.0 -mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7" +mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7" types-dataclasses>=0.1.3; python_version < "3.7" types-mock>=0.1.1 types-setuptools>=57.0.0 From 2d4fb94ba0a23523cc9adb65e0dcf92bbf6177b6 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 9 Feb 2023 12:58:14 +0100 Subject: [PATCH 05/32] Fix wrong file name in docs for rule-based matcher. (#12262) --- website/docs/usage/rule-based-matching.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 08d2b3b91..628c2953f 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1442,8 +1442,8 @@ nlp.to_disk("/path/to/pipeline") The saved pipeline now includes the `"entity_ruler"` in its [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a -file `entityruler.jsonl` with the patterns. When you load the pipeline back in, -all pipeline components will be restored and deserialized – including the entity +file `patterns.jsonl` with the patterns. When you load the pipeline back in, all +pipeline components will be restored and deserialized – including the entity ruler. This lets you ship powerful pipeline packages with binary weights _and_ rules included! 
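As a quick illustration of the serialization behavior the corrected docs describe — a minimal sketch assuming a recent spaCy v3 install; the save path is a placeholder:

```python
import spacy

# A pipeline containing an entity_ruler writes its patterns to a
# patterns.jsonl file when saved, and restores them on load.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Explosion"}])
nlp.to_disk("/tmp/pipeline")  # placeholder path
nlp2 = spacy.load("/tmp/pipeline")  # the entity ruler comes back with its patterns
print([(ent.text, ent.label_) for ent in nlp2("Explosion builds spaCy").ents])
```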
From 61b84541378fddab19fc3507eb29745a3e5efd42 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Wed, 15 Feb 2023 12:32:53 +0100 Subject: [PATCH 06/32] Adjust return type of `registry.find` (#12227) * Fix registry find return type * add dot * Add type ignore for mypy * update black formatting version * add mypy ignore to package cli * mypy type fix (for real) * Update find description in spacy/util.py Co-authored-by: Raphael Mitsch * adjust mypy directive --------- Co-authored-by: Raphael Mitsch --- spacy/cli/package.py | 2 +- spacy/util.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 324c5d1bb..6351f28eb 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -252,7 +252,7 @@ def get_third_party_dependencies( raise regerr from None module_name = func_info.get("module") # type: ignore[attr-defined] if module_name: # the code is part of a module, not a --code file - modules.add(func_info["module"].split(".")[0]) # type: ignore[index] + modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr] dependencies = [] for module_name in modules: if module_name in distributions: diff --git a/spacy/util.py b/spacy/util.py index 8bf8fb1b0..dc7a4efe0 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -144,8 +144,17 @@ class registry(thinc.registry): return func @classmethod - def find(cls, registry_name: str, func_name: str) -> Callable: - """Get info about a registered function from the registry.""" + def find( + cls, registry_name: str, func_name: str + ) -> Dict[str, Optional[Union[str, int]]]: + """Find information about a registered function, including the + module and path to the file it's defined in, the line number and the + docstring, if available. + + registry_name (str): Name of the catalogue registry. + func_name (str): Name of the registered function. + RETURNS (Dict[str, Optional[Union[str, int]]]): The function info. + """ # We're overwriting this classmethod so we're able to provide more # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): From b95123060afdb1b357261ff7de45575d0e7d4acc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 15 Feb 2023 12:34:33 +0100 Subject: [PATCH 07/32] Make Span.char_span optional args keyword-only (#12257) * Make Span.char_span optional args keyword-only * Make kb_id and following kw-only * Format --- spacy/tokens/doc.pyi | 3 ++- spacy/tokens/doc.pyx | 4 ++-- spacy/tokens/span.pyi | 1 + spacy/tokens/span.pyx | 6 +++--- website/docs/api/doc.mdx | 1 + website/docs/api/span.mdx | 5 +++-- 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 93cd8de05..6ff61c05d 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -105,6 +105,7 @@ class Doc: start_idx: int, end_idx: int, label: Union[int, str] = ..., + *, kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., alignment_mode: str = ..., @@ -127,7 +128,7 @@ class Doc: blocked: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ..., - default: str = ... + default: str = ..., ) -> None: ... @property def noun_chunks(self) -> Iterator[Span]: ... 
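The net effect of the keyword-only markers added in this patch, sketched with placeholder text and a made-up knowledge-base ID:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("The quick brown fox jumped")

# start_idx, end_idx and label may still be passed positionally ...
span = doc.char_span(4, 19, "PHRASE")
# ... but kb_id and all later arguments must now be given by keyword:
span = doc.char_span(4, 19, "PHRASE", kb_id="Q123", alignment_mode="expand")
# Positional use such as doc.char_span(4, 19, "PHRASE", "Q123") now raises
# a TypeError.
print(span.text, span.label_)  # quick brown fox PHRASE
```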
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2eca1aafd..f4836dd14 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -520,7 +520,7 @@ cdef class Doc: def doc(self): return self - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): + def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be created. @@ -1605,7 +1605,7 @@ cdef class Doc: for span_group in doc_json.get("spans", {}): spans = [] for span in doc_json["spans"][span_group]: - char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"]) + char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"]) if char_span is None: raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"])) spans.append(char_span) diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 549990c5e..88cb90a17 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -94,6 +94,7 @@ class Span: start_idx: int, end_idx: int, label: Union[int, str] = ..., + *, kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., alignment_mode: str = ..., diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 4990cb5f7..25dbfecdf 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -666,11 +666,11 @@ cdef class Span: else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): + def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `span.text[start : end]`. - start (int): The index of the first character of the span. - end (int): The index of the first character after the span. + start_idx (int): The index of the first character of the span. + end_idx (int): The index of the first character after the span. label (Union[int, str]): A label to attach to the Span, e.g. for named entities. kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index 1a3f6179f..96fe2c35a 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -214,6 +214,7 @@ alignment mode `"strict". | `start` | The index of the first character of the span. ~~int~~ | | `end` | The index of the last character after the span. ~~int~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| _keyword-only_ | | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index 7e7042866..832501d37 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -188,9 +188,10 @@ the character indices don't map to a valid span. 
| Name | Description | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | +| `start_idx` | The index of the first character of the span. ~~int~~ | +| `end_idx` | The index of the last character after the span. ~~int~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| _keyword-only_ | | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `alignment_mode` 3.5.1 | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | From dd3f138830f352dac59eea6683d2c9490070dabe Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 16 Feb 2023 19:08:55 +0900 Subject: [PATCH 08/32] Use tempfile.TemporaryDirectory (#12285) --- spacy/util.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index e2ca0e6a4..2ce2e5e0f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1028,12 +1028,11 @@ def make_tempdir() -> Generator[Path, None, None]: YIELDS (Path): The path of the temp directory. """ - d = Path(tempfile.mkdtemp()) - yield d try: - shutil.rmtree(str(d)) + with tempfile.TemporaryDirectory() as td: + yield Path(td) except PermissionError as e: - warnings.warn(Warnings.W091.format(dir=d, msg=e)) + warnings.warn(Warnings.W091.format(dir=td, msg=e)) def is_cwd(path: Union[Path, str]) -> bool: From 80bc140533092ab129568e85975fa6fb76f97dd4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 16 Feb 2023 17:57:02 +0100 Subject: [PATCH 09/32] Add grc to langs with lexeme norms in spacy-lookups-data (#12287) --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index dc7a4efe0..38ba7b1b5 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -60,7 +60,7 @@ if TYPE_CHECKING: # fmt: off OOV_RANK = numpy.iinfo(numpy.uint64).max DEFAULT_OOV_PROB = -20 -LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] +LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config file. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. From e27c60a70263f7ab17968964de37e938653e37a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 21 Feb 2023 15:47:18 +0100 Subject: [PATCH 10/32] Reimplement distillation with oracle cut size (#12214) * Improve the correctness of _parse_patch * If there are no more actions, do not attempt to make further transitions, even if not all states are final. * Assert that the number of actions for a step is the same as the number of states. * Reimplement distillation with oracle cut size The code for distillation with an oracle cut size was not reimplemented after the parser refactor. 
We did not notice, because we did not have tests for this functionality. This change brings back the functionality and adds this to the parser tests. * Rename states2actions to _states_to_actions for consistency * Test distillation max cuts in NER * Mark parser/NER tests as slow * Typo * Fix invariant in _states_diff_to_actions * Rename _init_batch -> _init_batch_from_teacher * Ninja edit the ninja edit * Check that we raise an exception when we pass the incorrect number or actions * Remove unnecessary get Co-authored-by: Madeesh Kannan * Write out condition more explicitly --------- Co-authored-by: Madeesh Kannan --- spacy/ml/tb_framework.pyx | 4 +- spacy/pipeline/transition_parser.pyx | 101 +++++++++++++++++++++------ spacy/tests/parser/test_model.py | 61 ++++++++++++++++ spacy/tests/parser/test_ner.py | 5 +- spacy/tests/parser/test_parse.py | 5 +- 5 files changed, 152 insertions(+), 24 deletions(-) create mode 100644 spacy/tests/parser/test_model.py diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx index 79be13b00..9b2114900 100644 --- a/spacy/ml/tb_framework.pyx +++ b/spacy/ml/tb_framework.pyx @@ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states, cdef np.ndarray step_actions scores = [] - while sizes.states >= 1: + while sizes.states >= 1 and (actions is None or len(actions) > 0): step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f") step_actions = actions[0] if actions is not None else None + assert step_actions is None or step_actions.size == sizes.states, \ f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})" with nogil: _predict_states(cblas, &activations, step_scores.data, states, &weights, sizes) if actions is None: diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 9e50dd7b2..2d2a36252 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -36,6 +36,11 @@ from ..errors import Errors, Warnings from .. import util +# TODO: Remove when we switch to Cython 3. +cdef extern from "<algorithm>" namespace "std" nogil: + bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except + + + NUMPY_OPS = NumpyOps() @@ -253,8 +258,8 @@ class Parser(TrainablePipe): # batch uniform length. Since we do not have a gold standard # sequence, we use the teacher's predictions as the gold # standard. - max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) - states = self._init_batch(teacher_pipe, student_docs, max_moves) + max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2)) + states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves) else: states = self.moves.init_batch(student_docs) @@ -265,12 +270,12 @@ class Parser(TrainablePipe): # gradients of the student's transition distributions relative to the # teacher's distributions.
- student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves, - max_moves=max_moves) + student_inputs = TransitionModelInputs(docs=student_docs, + states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) - actions = states2actions(student_states) + actions = _states_diff_to_actions(states, student_states) teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples], - moves=self.moves, actions=actions) + states=states, moves=teacher_pipe.moves, actions=actions) (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs) loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores) @@ -522,7 +527,7 @@ class Parser(TrainablePipe): set_dropout_rate(self.model, 0.0) student_inputs = TransitionModelInputs(docs=docs, moves=self.moves) (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) - actions = states2actions(student_states) + actions = _states_to_actions(student_states) teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions) _, teacher_scores = self._rehearsal_model.predict(teacher_inputs) @@ -642,7 +647,7 @@ class Parser(TrainablePipe): raise ValueError(Errors.E149) from None return self - def _init_batch(self, teacher_step_model, docs, max_length): + def _init_batch_from_teacher(self, teacher_pipe, docs, max_length): """Make a square batch of length equal to the shortest transition sequence or a cap. A long doc will get multiple states. Let's say we have a doc of length 2*N, @@ -651,10 +656,12 @@ class Parser(TrainablePipe): _init_gold_batch, this version uses a teacher model to generate the cut sequences.""" cdef: - StateClass start_state StateClass state - Transition action - all_states = self.moves.init_batch(docs) + TransitionSystem moves = teacher_pipe.moves + + # Start with the same heuristic as in supervised training: exclude + # docs that are within the maximum length. + all_states = moves.init_batch(docs) states = [] to_cut = [] for state, doc in zip(all_states, docs): @@ -663,18 +670,28 @@ class Parser(TrainablePipe): states.append(state) else: to_cut.append(state) + + if not to_cut: + return states + + # Parse the states that are too long with the teacher's parsing model. + teacher_inputs = TransitionModelInputs(docs=docs, moves=moves, + states=[state.copy() for state in to_cut]) + (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs) + + # Step through the teacher's actions and store every state after + # each multiple of max_length. + teacher_actions = _states_to_actions(teacher_states) while to_cut: states.extend(state.copy() for state in to_cut) - # Move states forward max_length actions. - length = 0 - while to_cut and length < max_length: - teacher_scores = teacher_step_model.predict(to_cut) - self.transition_states(to_cut, teacher_scores) - # States that are completed do not need further cutting. 
- to_cut = [state for state in to_cut if not state.is_final()] - length += 1 - return states + for step_actions in teacher_actions[:max_length]: + to_cut = moves.apply_actions(to_cut, step_actions) + teacher_actions = teacher_actions[max_length:] + if len(teacher_actions) < max_length: + break + + return states def _init_gold_batch(self, examples, max_length): """Make a square batch, of length equal to the shortest transition @@ -736,7 +753,7 @@ def _change_attrs(model, **kwargs): model.attrs[key] = value -def states2actions(states: List[StateClass]) -> List[Ints1d]: +def _states_to_actions(states: List[StateClass]) -> List[Ints1d]: cdef int step cdef StateClass state cdef StateC* c_state @@ -757,3 +774,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]: actions.append(numpy.array(step_actions, dtype="i")) return actions + +def _states_diff_to_actions( + before_states: List[StateClass], + after_states: List[StateClass] +) -> List[Ints1d]: + """ + Return for two sets of states the actions to go from the first set of + states to the second set of states. The histories of the first set of + states must be a prefix of the second set of states. + """ + cdef StateClass before_state, after_state + cdef StateC* c_state_before + cdef StateC* c_state_after + + assert len(before_states) == len(after_states) + + # Check invariant: before states histories must be prefixes of after states. + for before_state, after_state in zip(before_states, after_states): + c_state_before = before_state.c + c_state_after = after_state.c + + assert equal(c_state_before.history.begin(), c_state_before.history.end(), + c_state_after.history.begin()) + + actions = [] + while True: + step = len(actions) + + step_actions = [] + for before_state, after_state in zip(before_states, after_states): + c_state_before = before_state.c + c_state_after = after_state.c + if step < c_state_after.history.size() - c_state_before.history.size(): + step_actions.append(c_state_after.history[c_state_before.history.size() + step]) + + # We are done if we have exhausted all histories. 
+ if len(step_actions) == 0: + break + + actions.append(numpy.array(step_actions, dtype="i")) + + return actions diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py new file mode 100644 index 000000000..8c1cf7a93 --- /dev/null +++ b/spacy/tests/parser/test_model.py @@ -0,0 +1,61 @@ +import numpy +import pytest + +from spacy.lang.en import English +from spacy.ml.tb_framework import TransitionModelInputs +from spacy.training import Example + +TRAIN_DATA = [ + ( + "They trade mortgage-backed securities.", + { + "heads": [1, 1, 4, 4, 5, 1, 1], + "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"], + }, + ), + ( + "I like London and Berlin.", + { + "heads": [1, 1, 1, 2, 2, 1], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + }, + ), +] + + +@pytest.fixture +def nlp_parser(): + nlp = English() + parser = nlp.add_pipe("parser") + + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for dep in annotations["deps"]: + parser.add_label(dep) + nlp.initialize() + + return nlp, parser + + +def test_incorrect_number_of_actions(nlp_parser): + nlp, parser = nlp_parser + doc = nlp.make_doc("test") + + # Too many actions for the number of docs + with pytest.raises(AssertionError): + parser.model.predict( + TransitionModelInputs( + docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")] + ) + ) + + # Too few actions for the number of docs + with pytest.raises(AssertionError): + parser.model.predict( + TransitionModelInputs( + docs=[doc, doc], + moves=parser.moves, + actions=[numpy.array([0], dtype="i")], + ) + ) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index d6cd11e55..62b8f9704 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -623,7 +623,9 @@ def test_is_distillable(): assert ner.is_distillable -def test_distill(): +@pytest.mark.slow +@pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) +def test_distill(max_moves): teacher = English() teacher_ner = teacher.add_pipe("ner") train_examples = [] @@ -641,6 +643,7 @@ def test_distill(): student = English() student_ner = student.add_pipe("ner") + student_ner.cfg["update_with_oracle_cut_size"] = max_moves student_ner.initialize( get_examples=lambda: train_examples, labels=teacher_ner.label_data ) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 57b6e188b..2f2fa397e 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -463,7 +463,9 @@ def test_is_distillable(): assert parser.is_distillable -def test_distill(): +@pytest.mark.slow +@pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) +def test_distill(max_moves): teacher = English() teacher_parser = teacher.add_pipe("parser") train_examples = [] @@ -481,6 +483,7 @@ def test_distill(): student = English() student_parser = student.add_pipe("parser") + student_parser.cfg["update_with_oracle_cut_size"] = max_moves student_parser.initialize( get_examples=lambda: train_examples, labels=teacher_parser.label_data ) From daedc45d050b15be8c5422aadff7b652439a562d Mon Sep 17 00:00:00 2001 From: andyjessen <62343929+andyjessen@users.noreply.github.com> Date: Thu, 23 Feb 2023 01:37:40 -0700 Subject: [PATCH 11/32] Fix FUZZY operator definition (#12318) * Fix FUZZY operator definition The default length of the FUZZY operator is 2 and not 3. 
* adjust edit distance in matcher usage docs too --------- Co-authored-by: svlandeg --- website/docs/usage/rule-based-matching.mdx | 4 ++-- website/docs/usage/v3-5.mdx | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 628c2953f..bad049479 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -384,10 +384,10 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum allowed edit distance directly. ```python -# Match lowercase with fuzzy matching (allows 3 edits) +# Match lowercase with fuzzy matching (allows 2 edits by default) pattern = [{"LOWER": {"FUZZY": "definitely"}}] -# Match custom attribute values with fuzzy matching (allows 3 edits) +# Match custom attribute values with fuzzy matching (allows 2 edits by default) pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] # Match with exact Levenshtein edit distance limits (allows 4 edits) diff --git a/website/docs/usage/v3-5.mdx b/website/docs/usage/v3-5.mdx index 3ca64f8a2..54c976fe5 100644 --- a/website/docs/usage/v3-5.mdx +++ b/website/docs/usage/v3-5.mdx @@ -70,13 +70,13 @@ distance of 2 and up to 30% of the pattern string length. `FUZZY1`..`FUZZY9` can be used to specify the exact number of allowed edits. ```python -# Match lowercase with fuzzy matching (allows up to 3 edits) +# Match lowercase with fuzzy matching (allows 2 edits by default) pattern = [{"LOWER": {"FUZZY": "definitely"}}] -# Match custom attribute values with fuzzy matching (allows up to 3 edits) +# Match custom attribute values with fuzzy matching (allows 2 edits by default) pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] -# Match with exact Levenshtein edit distance limits (allows up to 4 edits) +# Match with exact Levenshtein edit distance limits (allows 4 edits) pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}] ``` From 1e8bac99f3febd7c00ba53cc8efebf5d6f989a8b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 23 Feb 2023 18:22:57 +0900 Subject: [PATCH 12/32] Add tests for projects to master (#12303) * Add tests for projects to master * Fix git clone related issues on Windows * Add stat import --- spacy/tests/test_cli_app.py | 137 ++++++++++++++++++++++++++++++++++++ spacy/util.py | 10 ++- 2 files changed, 146 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 40100412a..8aaadf686 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,5 +1,7 @@ import os from pathlib import Path +import pytest +import srsly from typer.testing import CliRunner from spacy.tokens import DocBin, Doc @@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab): # Instead of checking specific wording of the output, which may change, # we'll check that this section of the debug output is present. 
assert "= Trainable Lemmatizer =" in result_debug_data.stdout + + +# project tests + +SAMPLE_PROJECT = { + "title": "Sample project", + "description": "This is a project for testing", + "assets": [ + { + "dest": "assets/spacy-readme.md", + "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md", + "checksum": "411b2c89ccf34288fae8ed126bf652f7", + }, + { + "dest": "assets/citation.cff", + "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff", + "checksum": "c996bfd80202d480eb2e592369714e5e", + "extra": True, + }, + ], + "commands": [ + { + "name": "ok", + "help": "print ok", + "script": ["python -c \"print('okokok')\""], + }, + { + "name": "create", + "help": "make a file", + "script": ["touch abc.txt"], + "outputs": ["abc.txt"], + }, + { + "name": "clean", + "help": "remove test file", + "script": ["rm abc.txt"], + }, + ], +} + +SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT) + + +@pytest.fixture +def project_dir(): + with make_tempdir() as pdir: + (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT) + yield pdir + + +def test_project_document(project_dir): + readme_path = project_dir / "README.md" + assert not readme_path.exists(), "README already exists" + result = CliRunner().invoke( + app, ["project", "document", str(project_dir), "-o", str(readme_path)] + ) + assert result.exit_code == 0 + assert readme_path.is_file() + text = readme_path.read_text("utf-8") + assert SAMPLE_PROJECT["description"] in text + + +def test_project_assets(project_dir): + asset_dir = project_dir / "assets" + assert not asset_dir.exists(), "Assets dir is already present" + result = CliRunner().invoke(app, ["project", "assets", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded" + # check that extras work + result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded" + + +def test_project_run(project_dir): + # make sure dry run works + test_file = project_dir / "abc.txt" + result = CliRunner().invoke( + app, ["project", "run", "--dry", "create", str(project_dir)] + ) + assert result.exit_code == 0 + assert not test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)]) + assert result.exit_code == 0 + assert "okokok" in result.stdout + + +@pytest.mark.parametrize( + "options", + [ + "", + # "--sparse", + "--branch v3", + "--repo https://github.com/explosion/projects --branch v3", + ], +) +def test_project_clone(options): + with make_tempdir() as workspace: + out = workspace / "project" + target = "benchmarks/ner_conll03" + if not options: + options = [] + else: + options = options.split() + result = CliRunner().invoke( + app, ["project", "clone", target, *options, str(out)] + ) + assert result.exit_code == 0 + assert (out / "README.md").is_file() + + +def test_project_push_pull(project_dir): + proj = dict(SAMPLE_PROJECT) + remote = "xyz" + + with make_tempdir() as remote_dir: + proj["remotes"] = {remote: str(remote_dir)} + proj_text = srsly.yaml_dumps(proj) + (project_dir / "project.yml").write_text(proj_text) + + test_file = project_dir / "abc.txt" + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert 
test_file.is_file() + result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)]) + assert result.exit_code == 0 + result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)]) + assert result.exit_code == 0 + assert not test_file.exists() + result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() diff --git a/spacy/util.py b/spacy/util.py index 38ba7b1b5..8cc89217d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -32,6 +32,7 @@ import inspect import pkgutil import logging import socket +import stat try: import cupy.random @@ -1050,8 +1051,15 @@ def make_tempdir() -> Generator[Path, None, None]: """ d = Path(tempfile.mkdtemp()) yield d + + # On Windows, git clones use read-only files, which cause permission errors + # when being deleted. This forcibly fixes permissions. + def force_remove(rmfunc, path, ex): + os.chmod(path, stat.S_IWRITE) + rmfunc(path) + try: - shutil.rmtree(str(d)) + shutil.rmtree(str(d), onerror=force_remove) except PermissionError as e: warnings.warn(Warnings.W091.format(dir=d, msg=e)) From df4c069a132848bc24a227ec521ecb6a9054b227 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 23 Feb 2023 11:36:50 +0100 Subject: [PATCH 13/32] Remove backoff from .vector to .tensor (#12292) --- spacy/tokens/doc.pyx | 3 --- spacy/tokens/span.pyx | 2 -- spacy/tokens/token.pyx | 4 ---- website/docs/usage/101/_vectors-similarity.mdx | 15 +++++++++------ 4 files changed, 9 insertions(+), 15 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f4836dd14..e4adb9d28 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -657,9 +657,6 @@ cdef class Doc: elif self.vocab.vectors.size > 0: self._vector = sum(t.vector for t in self) / len(self) return self._vector - elif self.tensor.size > 0: - self._vector = self.tensor.mean(axis=0) - return self._vector else: return xp.zeros((self.vocab.vectors_length,), dtype="float32") diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 25dbfecdf..8fcf5ad83 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -532,8 +532,6 @@ cdef class Span: return self.doc.user_span_hooks["has_vector"](self) elif self.vocab.vectors.size > 0: return any(token.has_vector for token in self) - elif self.doc.tensor.size > 0: - return True else: return False diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 64c707acd..74f812af7 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -389,8 +389,6 @@ cdef class Token: """ if "has_vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["has_vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return True return self.vocab.has_vector(self.c.lex.orth) @property @@ -404,8 +402,6 @@ cdef class Token: """ if "vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return self.doc.tensor[self.i] else: return self.vocab.get_vector(self.c.lex.orth) diff --git a/website/docs/usage/101/_vectors-similarity.mdx b/website/docs/usage/101/_vectors-similarity.mdx index 6deab926d..39ee8e48a 100644 --- a/website/docs/usage/101/_vectors-similarity.mdx +++ b/website/docs/usage/101/_vectors-similarity.mdx @@ -22,17 +22,20 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, To make them compact and fast, spaCy's small [pipeline packages](/models) (all -packages that end in `sm`) **don't ship with 
word vectors**, and only include -context-sensitive **tensors**. This means you can still use the `similarity()` -methods to compare documents, spans and tokens – but the result won't be as -good, and individual tokens won't have any vectors assigned. So in order to use -_real_ word vectors, you need to download a larger pipeline package: +packages that end in `sm`) **don't ship with word vectors**. In order to use +`similarity()`, you need to download a larger pipeline package that includes +vectors: ```diff - python -m spacy download en_core_web_sm -+ python -m spacy download en_core_web_lg ++ python -m spacy download en_core_web_md ``` +In spaCy v3 and earlier, small pipeline packages supported `similarity()` by +backing off to context-sensitive tensors from the `tok2vec` component. These +tensors do not work well for this purpose and this backoff has been removed in +spaCy v4. + Pipeline packages that come with built-in word vectors make them available as From acdd993071319cd7b02a651ac0d046a16e89695e Mon Sep 17 00:00:00 2001 From: Kevin Humphreys Date: Sun, 26 Feb 2023 23:35:08 -0800 Subject: [PATCH 14/32] Matcher performance fix for extension predicates: use shared key function (#12272) * standardize predicate key format * single key function * Make optional args in key function keyword-only --------- Co-authored-by: Adriane Boyd --- spacy/matcher/matcher.pyx | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index ea1b4b66b..b886bd2ec 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -828,6 +828,11 @@ def _get_attr_values(spec, string_store): return attr_values +def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None): + # tuple order affects performance + return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True)) + + # These predicate helper classes are used to match the REGEX, IN, >= etc # extensions to the matcher introduced in #3173. 
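For context, a short sketch of the pattern syntax these predicate classes implement — all of the predicates shown (`IN`, `REGEX`, `>=`, `FUZZY`) are documented `Matcher` features, assuming spaCy v3.5+ for `FUZZY`; the example texts are made up:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("SET", [[{"LOWER": {"IN": ["cat", "dog"]}}]])      # set membership
matcher.add("REGEX", [[{"TEXT": {"REGEX": r"^[Dd]ogs?$"}}]])   # regex predicate
matcher.add("CMP", [[{"LENGTH": {">=": 6}}]])                  # comparison predicate
matcher.add("FUZZY", [[{"LOWER": {"FUZZY": "definitely"}}]])   # fuzzy predicate
doc = nlp("The dog deffinitely chased a cat")
print([(nlp.vocab.strings[mid], doc[s:e].text) for mid, s, e in matcher(doc)])
```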
@@ -847,7 +852,7 @@ class _FuzzyPredicate: fuzz = self.predicate[len("FUZZY"):] # number after prefix self.fuzzy = int(fuzz) if fuzz else -1 self.fuzzy_compare = fuzzy_compare - self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy) def __call__(self, Token token): if self.is_extension: @@ -869,7 +874,7 @@ class _RegexPredicate: self.value = re.compile(value) self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -905,7 +910,7 @@ class _SetPredicate: self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -977,7 +982,7 @@ class _ComparisonPredicate: self.value = value self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -1092,7 +1097,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types, if isinstance(value, dict): for type_, cls in predicate_types.items(): if type_ in value: - key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True)) + key = _predicate_cache_key(attr, type_, value[type_]) if key in seen_predicates: output.append(seen_predicates[key]) else: From 4539fbae176295fd271855cdccb25820eef1ca96 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Feb 2023 09:48:36 +0100 Subject: [PATCH 15/32] Revert "Fix FUZZY operator definition (#12318)" (#12336) This reverts commit daedc45d050b15be8c5422aadff7b652439a562d. The default length depends on the length of the pattern string and was correct for this example. --- website/docs/usage/rule-based-matching.mdx | 4 ++-- website/docs/usage/v3-5.mdx | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index bad049479..628c2953f 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -384,10 +384,10 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum allowed edit distance directly. 
```python -# Match lowercase with fuzzy matching (allows 2 edits by default) +# Match lowercase with fuzzy matching (allows 3 edits) pattern = [{"LOWER": {"FUZZY": "definitely"}}] -# Match custom attribute values with fuzzy matching (allows 2 edits by default) +# Match custom attribute values with fuzzy matching (allows 3 edits) pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] # Match with exact Levenshtein edit distance limits (allows 4 edits) diff --git a/website/docs/usage/v3-5.mdx b/website/docs/usage/v3-5.mdx index 54c976fe5..3ca64f8a2 100644 --- a/website/docs/usage/v3-5.mdx +++ b/website/docs/usage/v3-5.mdx @@ -70,13 +70,13 @@ distance of 2 and up to 30% of the pattern string length. `FUZZY1`..`FUZZY9` can be used to specify the exact number of allowed edits. ```python -# Match lowercase with fuzzy matching (allows 2 edits by default) +# Match lowercase with fuzzy matching (allows up to 3 edits) pattern = [{"LOWER": {"FUZZY": "definitely"}}] -# Match custom attribute values with fuzzy matching (allows 2 edits by default) +# Match custom attribute values with fuzzy matching (allows up to 3 edits) pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] -# Match with exact Levenshtein edit distance limits (allows 4 edits) +# Match with exact Levenshtein edit distance limits (allows up to 4 edits) pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}] ``` From e2de188cf1a70f8aa931cb4f9648fb906fece188 Mon Sep 17 00:00:00 2001 From: lise-brinck <104826278+lise-brinck@users.noreply.github.com> Date: Mon, 27 Feb 2023 10:53:45 +0100 Subject: [PATCH 16/32] Bugfix/swedish tokenizer (#12315) * add unittest for explosion#12311 * create punctuation.py for swedish * removed : from infixes in swedish punctuation.py * allow : as infix if succeeding char is uppercase --- spacy/lang/sv/__init__.py | 5 +-- spacy/lang/sv/punctuation.py | 33 +++++++++++++++++++ .../tests/lang/sv/test_prefix_suffix_infix.py | 7 ++++ 3 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 spacy/lang/sv/punctuation.py diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 6963e8b79..28e5085a8 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer - - -# Punctuation stolen from Danish -from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES class SwedishDefaults(BaseDefaults): diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py new file mode 100644 index 000000000..67f1bcdc4 --- /dev/null +++ b/spacy/lang/sv/punctuation.py @@ -0,0 +1,33 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import TOKENIZER_SUFFIXES + + +_quotes = CONCAT_QUOTES.replace("'", "") + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER), + ] +) + +_suffixes = [ + 
suffix + for suffix in TOKENIZER_SUFFIXES + if suffix not in ["'s", "'S", "’s", "’S", r"\'"] +] +_suffixes += [r"(?<=[^sSxXzZ])\'"] + + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py index bbb0ff415..0aa495992 100644 --- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py @@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text): def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 3 + + +@pytest.mark.issue(12311) +@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"]) +def test_sv_tokenizer_handles_colon(sv_tokenizer, text): + tokens = sv_tokenizer(text) + assert len(tokens) == 1 From 071667376a429da5420ba6332005c05a444d3f9f Mon Sep 17 00:00:00 2001 From: TAN Long <71320000+tanloong@users.noreply.github.com> Date: Tue, 28 Feb 2023 21:36:33 +0800 Subject: [PATCH 17/32] Add new REL_OPs: `>+`, `>-`, `<+`, and `<-` (#12334) * Add immediate left/right child/parent dependency relations * Add tests for new REL_OPs: `>+`, `>-`, `<+`, and `<-`. --------- Co-authored-by: Tan Long --- spacy/matcher/dependencymatcher.pyx | 26 +++++++++++++++++++ .../tests/matcher/test_dependency_matcher.py | 16 ++++++++++++ website/docs/api/dependencymatcher.mdx | 4 +++ website/docs/usage/rule-based-matching.mdx | 8 ++++++ 4 files changed, 54 insertions(+) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 74c2d002f..adf96702b 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -82,8 +82,12 @@ cdef class DependencyMatcher: "$-": self._imm_left_sib, "$++": self._right_sib, "$--": self._left_sib, + ">+": self._imm_right_child, + ">-": self._imm_left_child, ">++": self._right_child, ">--": self._left_child, + "<+": self._imm_right_parent, + "<-": self._imm_left_parent, "<++": self._right_parent, "<--": self._left_parent, } @@ -427,12 +431,34 @@ cdef class DependencyMatcher: def _left_sib(self, doc, node): return [doc[child.i] for child in doc[node].head.children if child.i < node] + def _imm_right_child(self, doc, node): + for child in doc[node].children: + if child.i == node + 1: + return [doc[child.i]] + return [] + + def _imm_left_child(self, doc, node): + for child in doc[node].children: + if child.i == node - 1: + return [doc[child.i]] + return [] + def _right_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i > node] def _left_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i < node] + def _imm_right_parent(self, doc, node): + if doc[node].head.i == node + 1: + return [doc[node].head] + return [] + + def _imm_left_parent(self, doc, node): + if doc[node].head.i == node - 1: + return [doc[node].head] + return [] + def _right_parent(self, doc, node): if doc[node].head.i > node: return [doc[node].head] diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index b4e19d69d..200384320 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches): ("the", "brown", "$--", 0), ("brown", "the", "$--", 1), ("brown", "brown", "$--", 0), + ("over", "jumped", "<+", 0), + ("quick", "fox", "<+", 0), + ("the", "quick", 
"<+", 0), + ("brown", "fox", "<+", 1), ("quick", "fox", "<++", 1), ("quick", "over", "<++", 0), ("over", "jumped", "<++", 0), ("the", "fox", "<++", 2), + ("brown", "fox", "<-", 0), + ("fox", "over", "<-", 0), + ("the", "over", "<-", 0), + ("over", "jumped", "<-", 1), ("brown", "fox", "<--", 0), ("fox", "jumped", "<--", 0), ("fox", "over", "<--", 1), + ("fox", "brown", ">+", 0), + ("over", "fox", ">+", 0), + ("over", "the", ">+", 0), + ("jumped", "over", ">+", 1), ("jumped", "over", ">++", 1), ("fox", "lazy", ">++", 0), ("over", "the", ">++", 0), + ("jumped", "over", ">-", 0), + ("fox", "quick", ">-", 0), + ("brown", "quick", ">-", 0), + ("fox", "brown", ">-", 1), ("brown", "fox", ">--", 0), ("fox", "brown", ">--", 1), ("jumped", "fox", ">--", 1), diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index 390034a6c..cad5185f7 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -82,8 +82,12 @@ come directly from | `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | | `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | | `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | | `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | | `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 628c2953f..6a11ac8bd 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1110,6 +1110,14 @@ come directly from | `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` | `B` is a left immediate parent of `A`, i.e. 
`A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} From 8f058e39bd95da1f14d0071452b4d58103014dc7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 28 Feb 2023 16:36:03 +0100 Subject: [PATCH 18/32] Fix error message for displacy auto_select_port (#12343) --- spacy/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index d143e341c..ab013f3eb 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -967,7 +967,7 @@ class Errors(metaclass=ErrorsWithCodes): E1049 = ("No available port found for displaCy on host {host}. Please specify an available port " "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " - "or use `auto_switch_port=True` to pick an available port automatically.") + "or use `auto_select_port=True` to pick an available port automatically.") # Deprecated model shortcuts, only used in errors and warnings From 33864f1d07cba3291aaa51a20eb9482d7d1ee734 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 1 Mar 2023 10:46:13 +0100 Subject: [PATCH 19/32] Add new tags in docs for #12334 (#12348) --- website/docs/api/dependencymatcher.mdx | 44 +++++++++++----------- website/docs/usage/rule-based-matching.mdx | 44 +++++++++++----------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index cad5185f7..14e0916d1 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -68,28 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >+ B` | `B` is a right immediate child of `A`, i.e. 
`A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
-| `A >- B`  | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
-| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
-| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
-| `A <+ B`  | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
-| `A <- B`  | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
-| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
-| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| Symbol                                  | Description                                                                                                            |
+| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
+| `A < B`   | `A` is the immediate dependent of `B`. |
+| `A > B`   | `A` is the immediate head of `B`. |
+| `A << B`  | `A` is the dependent in a chain to `B` following dep → head paths. |
+| `A >> B`  | `A` is the head in a chain to `B` following head → dep paths. |
+| `A . B`   | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
+| `A .* B`  | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A ; B`   | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A ;* B`  | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A $+ B`  | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
+| `A $- B`  | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
+| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
+| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
+| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
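To see the new operators in action, here is a minimal sketch using the immediate right child operator `>+` (it assumes the `en_core_web_sm` pipeline is installed; the pattern IDs `"verb"` and `"prep"` are arbitrary illustration, not part of the patch):

```python
# Minimal sketch of the new immediate right child operator ">+".
# Assumes en_core_web_sm is installed; pattern IDs are arbitrary.
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    # ">+" matches only if "prep" is a syntactic child of "verb" AND the
    # token immediately to its right, i.e. verb.i == prep.i - 1.
    {
        "LEFT_ID": "verb",
        "REL_OP": ">+",
        "RIGHT_ID": "prep",
        "RIGHT_ATTRS": {"POS": "ADP"},
    },
]
matcher.add("IMM_RIGHT_CHILD", [pattern])

doc = nlp("The quick brown fox jumped over the lazy dog")
# "over" is a child of "jumped" and directly follows it, so this matches.
print(matcher(doc))  # e.g. [(match_id, [4, 5])] for "jumped" and "over"
```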

## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}

diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 628c2953f..6a11ac8bd 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -1096,28 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
 come directly from
 [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
 
-| Symbol    | Description |
-| --------- | -------------------------------------------------------------------------------------------------------------------- |
-| `A < B`   | `A` is the immediate dependent of `B`. |
-| `A > B`   | `A` is the immediate head of `B`. |
-| `A << B`  | `A` is the dependent in a chain to `B` following dep → head paths. |
-| `A >> B`  | `A` is the head in a chain to `B` following head → dep paths. |
-| `A . B`   | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
-| `A .* B`  | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A ; B`   | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A ;* B`  | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
-| `A $+ B`  | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
-| `A $- B`  | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
-| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
-| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
-| `A >+ B`  | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
-| `A >- B`  | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
-| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
-| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
-| `A <+ B`  | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
-| `A <- B`  | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
-| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
-| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| Symbol                                  | Description                                                                                                            |
+| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
+| `A < B`   | `A` is the immediate dependent of `B`. |
+| `A > B`   | `A` is the immediate head of `B`. |
+| `A << B`  | `A` is the dependent in a chain to `B` following dep → head paths. |
+| `A >> B`  | `A` is the head in a chain to `B` following head → dep paths. |
+| `A . B`   | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
+| `A .* B`  | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A ; B`   | `A` immediately follows `B`, i.e.
`A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A ;* B`  | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
+| `A $+ B`  | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
+| `A $- B`  | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
+| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
+| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
+| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
 
 ### Designing dependency matcher patterns {id="dependencymatcher-patterns"}

From efbc3d37b36fe1df14b23a746275cdbe19163e9b Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Wed, 1 Mar 2023 11:01:35 +0100
Subject: [PATCH 20/32] Update docs w.r.t. spacy.CandidateBatchGenerator.v1.
 (#12350)

---
 website/docs/api/architectures.mdx | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
index 2a1bc4380..966b5830a 100644
--- a/website/docs/api/architectures.mdx
+++ b/website/docs/api/architectures.mdx
@@ -924,6 +924,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
 `CandidateGenerator` uses the text of a mention to find its potential
 aliases in the `KnowledgeBase`. Note that this function is case-dependent.
 
+### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}
+
+A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
+[`Span`](/api/span) objects denoting named entities, and returns a list of
+plausible [`Candidate`](/api/kb/#candidate) objects per specified
+[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
+mention to find its potential aliases in the `KnowledgeBase`. Note that this
+function is case-dependent.
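A custom batch generator can be plugged in through the registry. The sketch below is illustrative only: the `@misc` name is hypothetical, and it assumes an `InMemoryLookupKB`, whose `get_alias_candidates` method performs the exact-alias lookup:

```python
# Illustrative sketch of a custom batch candidate generator. The registry
# name "my.LowercaseCandidateBatchGenerator.v1" is a placeholder.
from typing import Callable, Iterable, List

from spacy.kb import Candidate, InMemoryLookupKB
from spacy.tokens import Span
from spacy.util import registry


@registry.misc("my.LowercaseCandidateBatchGenerator.v1")
def create_candidates_batch() -> Callable[
    [InMemoryLookupKB, Iterable[Span]], Iterable[Iterable[Candidate]]
]:
    def get_candidates_batch(
        kb: InMemoryLookupKB, mentions: Iterable[Span]
    ) -> List[List[Candidate]]:
        # Unlike the default generator, look up each mention's lowercased text.
        return [list(kb.get_alias_candidates(span.text.lower())) for span in mentions]

    return get_candidates_batch
```

Such a function can then be selected for the `entity_linker` component via a `[components.entity_linker.get_candidates_batch]` block in the config.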
+ ## Coreference {id="coref-architectures",tag="experimental"} A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to From 74cae47bf65d99dbe50b0fe95f04141779c8005b Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 1 Mar 2023 12:06:07 +0100 Subject: [PATCH 21/32] rely on is_empty property instead of __len__ (#12347) --- spacy/errors.py | 3 +-- spacy/kb/kb_in_memory.pyx | 3 +++ spacy/pipeline/entity_linker.py | 2 +- spacy/tests/pipeline/test_entity_linker.py | 3 +++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index ab013f3eb..2c8b98aad 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -444,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes): E133 = ("The sum of prior probabilities for alias '{alias}' should not " "exceed 1, but found {sum}.") E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") - E139 = ("Knowledge base for component '{name}' is empty. Use the methods " - "`kb.add_entity` and `kb.add_alias` to add entries.") + E139 = ("Knowledge base for component '{name}' is empty.") E140 = ("The list of entities, prior probabilities and entity vectors " "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index edba523cf..2a74d047b 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): self._alias_index = PreshMap(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1) + def is_empty(self): + return len(self) == 0 + def __len__(self): return self.get_size_entities() diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 62845287b..a11964117 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -250,7 +250,7 @@ class EntityLinker(TrainablePipe): # Raise an error if the knowledge base is not initialized. 
if self.kb is None: raise ValueError(Errors.E1018.format(name=self.name)) - if len(self.kb) == 0: + if hasattr(self.kb, "is_empty") and self.kb.is_empty(): raise ValueError(Errors.E139.format(name=self.name)) def initialize( diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 99f164f15..2a6258386 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -353,6 +353,9 @@ def test_kb_default(nlp): """Test that the default (empty) KB is loaded upon construction""" entity_linker = nlp.add_pipe("entity_linker", config={}) assert len(entity_linker.kb) == 0 + with pytest.raises(ValueError, match="E139"): + # this raises an error because the KB is empty + entity_linker.validate_kb() assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_aliases() == 0 # 64 is the default value from pipeline.entity_linker From 56aa0cc75fbbfc55d95541392675092cb1e2e782 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Wed, 1 Mar 2023 15:38:23 +0100 Subject: [PATCH 22/32] Displacy doc fix (#12352) * more details for color setting * more details for color setting * prettier --- website/docs/api/top-level.mdx | 32 +++++++++++++++--------------- website/docs/usage/visualizers.mdx | 12 +++++------ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index 9748719d7..d0851a59f 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Description | -| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | -| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | -| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | -| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | -| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | -| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | -| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | -| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | -| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | -| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | -| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. 
~~int~~ | +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | +| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | +| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | +| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | +| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | +| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | +| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | +| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | +| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | #### Named Entity Visualizer options {id="displacy_options-ent"} diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx index 1d3682af4..c372744de 100644 --- a/website/docs/usage/visualizers.mdx +++ b/website/docs/usage/visualizers.mdx @@ -58,12 +58,12 @@ arcs. -| Argument | Description | -| --------- | ----------------------------------------------------------------------------------------- | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| Argument | Description | +| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. 
~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | For a list of all available options, see the [`displacy` API documentation](/api/top-level#displacy_options). From da75896ef5454af866744497eebf465b2eb8eefa Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 1 Mar 2023 16:00:02 +0100 Subject: [PATCH 23/32] Return Tuple[Span] for all Doc/Span attrs that provide spans (#12288) * Return Tuple[Span] for all Doc/Span attrs that provide spans * Update Span types --- spacy/tokens/doc.pyi | 4 ++-- spacy/tokens/doc.pyx | 23 +++++++++++------------ spacy/tokens/span.pyi | 4 +++- spacy/tokens/span.pyx | 26 +++++++++++++++----------- website/docs/api/doc.mdx | 23 +++++++++++------------ website/docs/api/span.mdx | 33 ++++++++++++++++----------------- 6 files changed, 58 insertions(+), 55 deletions(-) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 6ff61c05d..48bc21c27 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -131,9 +131,9 @@ class Doc: default: str = ..., ) -> None: ... @property - def noun_chunks(self) -> Iterator[Span]: ... + def noun_chunks(self) -> Tuple[Span]: ... @property - def sents(self) -> Iterator[Span]: ... + def sents(self) -> Tuple[Span]: ... @property def lang(self) -> int: ... @property diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e4adb9d28..0ea2c39ab 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -703,10 +703,10 @@ cdef class Doc: return self.text property ents: - """The named entities in the document. Returns a tuple of named entity + """The named entities in the document. Returns a list of named entity `Span` objects, if the entity recognizer has been applied. - RETURNS (tuple): Entities in the document, one `Span` per entity. + RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity. DOCS: https://spacy.io/api/doc#ents """ @@ -864,7 +864,7 @@ cdef class Doc: NP-level coordination, no prepositional phrases, and no relative clauses. - YIELDS (Span): Noun chunks in the document. + RETURNS (Tuple[Span]): Noun chunks in the document. DOCS: https://spacy.io/api/doc#noun_chunks """ @@ -873,36 +873,35 @@ cdef class Doc: # Accumulate the result before beginning to iterate over it. This # prevents the tokenization from being changed out from under us - # during the iteration. The tricky thing here is that Span accepts - # its tokenization changing, so it's okay once we have the Span - # objects. See Issue #375. + # during the iteration. spans = [] for start, end, label in self.noun_chunks_iterator(self): spans.append(Span(self, start, end, label=label)) - for span in spans: - yield span + return tuple(spans) @property def sents(self): """Iterate over the sentences in the document. Yields sentence `Span` objects. Sentence spans have no label. - YIELDS (Span): Sentences in the document. + RETURNS (Tuple[Span]): Sentences in the document. 
DOCS: https://spacy.io/api/doc#sents """ if not self.has_annotation("SENT_START"): raise ValueError(Errors.E030) if "sents" in self.user_hooks: - yield from self.user_hooks["sents"](self) + return tuple(self.user_hooks["sents"](self)) else: start = 0 + spans = [] for i in range(1, self.length): if self.c[i].sent_start == 1: - yield Span(self, start, i) + spans.append(Span(self, start, i)) start = i if start != self.length: - yield Span(self, start, self.length) + spans.append(Span(self, start, self.length)) + return tuple(spans) @property def lang(self): diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 88cb90a17..e5031fea9 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -74,6 +74,8 @@ class Span: @property def ents(self) -> Tuple[Span]: ... @property + def sents(self) -> Tuple[Span]: ... + @property def has_vector(self) -> bool: ... @property def vector(self) -> Floats1d: ... @@ -86,7 +88,7 @@ class Span: @property def text_with_ws(self) -> str: ... @property - def noun_chunks(self) -> Iterator[Span]: ... + def noun_chunks(self) -> Tuple[Span]: ... @property def root(self) -> Token: ... def char_span( diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 8fcf5ad83..75f7db7ca 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -461,20 +461,21 @@ cdef class Span: """Obtain the sentences that contain this span. If the given span crosses sentence boundaries, return all sentences it is a part of. - RETURNS (Iterable[Span]): All sentences that the span is a part of. + RETURNS (Tuple[Span]): All sentences that the span is a part of. - DOCS: https://spacy.io/api/span#sents + DOCS: https://spacy.io/api/span#sents """ cdef int start cdef int i if "sents" in self.doc.user_span_hooks: - yield from self.doc.user_span_hooks["sents"](self) - elif "sents" in self.doc.user_hooks: + return tuple(self.doc.user_span_hooks["sents"](self)) + spans = [] + if "sents" in self.doc.user_hooks: for sentence in self.doc.user_hooks["sents"](self.doc): if sentence.end > self.start: if sentence.start < self.end or sentence.start == self.start == self.end: - yield sentence + spans.append(sentence) else: break else: @@ -489,12 +490,13 @@ cdef class Span: # Now, find all the sentences in the span for i in range(start + 1, self.doc.length): if self.doc.c[i].sent_start == 1: - yield Span(self.doc, start, i) + spans.append(Span(self.doc, start, i)) start = i if start >= self.end: break if start < self.end: - yield Span(self.doc, start, self.end) + spans.append(Span(self.doc, start, self.end)) + return tuple(spans) @property @@ -502,7 +504,7 @@ cdef class Span: """The named entities that fall completely within the span. Returns a tuple of `Span` objects. - RETURNS (tuple): Entities in the span, one `Span` per entity. + RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity. DOCS: https://spacy.io/api/span#ents """ @@ -517,7 +519,7 @@ cdef class Span: ents.append(ent) else: break - return ents + return tuple(ents) @property def has_vector(self): @@ -613,13 +615,15 @@ cdef class Span: NP-level coordination, no prepositional phrases, and no relative clauses. - YIELDS (Span): Noun chunks in the span. + RETURNS (Tuple[Span]): Noun chunks in the span. 
DOCS: https://spacy.io/api/span#noun_chunks """ + spans = [] for span in self.doc.noun_chunks: if span.start >= self.start and span.end <= self.end: - yield span + spans.append(span) + return tuple(spans) @property def root(self): diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index 96fe2c35a..fca056ed0 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -654,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer). ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"} -Iterate over the base noun phrases in the document. Yields base noun-phrase -`Span` objects, if the document has been syntactically parsed. A base noun -phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be -nested within it – so no NP-level coordination, no prepositional phrases, and no -relative clauses. +Returns a tuple of the base noun phrases in the doc, if the document has been +syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that +does not permit other NPs to be nested within it – so no NP-level coordination, +no prepositional phrases, and no relative clauses. To customize the noun chunk iterator in a loaded pipeline, modify [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` @@ -675,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised. > assert chunks[1].text == "another phrase" > ``` -| Name | Description | -| ---------- | ------------------------------------- | -| **YIELDS** | Noun chunks in the document. ~~Span~~ | +| Name | Description | +| ----------- | -------------------------------------------- | +| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ | ## Doc.sents {id="sents",tag="property",model="sentences"} -Iterate over the sentences in the document. Sentence spans have no label. +Returns a tuple of the sentences in the document. Sentence spans have no label. This property is only available when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the @@ -697,9 +696,9 @@ will raise an error otherwise. > assert [s.root.text for s in sents] == ["is", "'s"] > ``` -| Name | Description | -| ---------- | ----------------------------------- | -| **YIELDS** | Sentences in the document. ~~Span~~ | +| Name | Description | +| ----------- | ------------------------------------------ | +| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ | ## Doc.has_vector {id="has_vector",tag="property",model="vectors"} diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index 832501d37..e1ada3b45 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -275,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of > assert ents[0].text == "Mr. Best" > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------------- | -| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------ | +| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ | ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"} -Iterate over the base noun phrases in the span. Yields base noun-phrase `Span` -objects, if the document has been syntactically parsed. 
A base noun phrase, or
-"NP chunk", is a noun phrase that does not permit other NPs to be nested within
-it – so no NP-level coordination, no prepositional phrases, and no relative
-clauses.
+Returns a tuple of the base noun phrases in the span if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it – so no NP-level coordination,
+no prepositional phrases, and no relative clauses.
 
 If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
 has not been implemented for the given language, a `NotImplementedError` is
 raised.
 
 > #### Example
 >
 > ```python
 > doc = nlp("A phrase with another phrase occurs.")
 > span = doc[3:5]
 > chunks = list(span.noun_chunks)
 > assert len(chunks) == 1
 > assert chunks[0].text == "another phrase"
 > ```
 
-| Name       | Description                       |
-| ---------- | --------------------------------- |
-| **YIELDS** | Noun chunks in the span. ~~Span~~ |
+| Name        | Description                              |
+| ----------- | ---------------------------------------- |
+| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
 
 ## Span.as_doc {id="as_doc",tag="method"}
 
 sent = doc[sent.start : max(sent.end, span.end)]
 ```
 
 ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}
 
-Returns a generator over the sentences the span belongs to. This property is
-only available when [sentence boundaries](/usage/linguistic-features#sbd) have
-been set on the document by the `parser`, `senter`, `sentencizer` or some custom
+Returns a tuple of the sentences the span belongs to. This property is only
+available when [sentence boundaries](/usage/linguistic-features#sbd) have been
+set on the document by the `parser`, `senter`, `sentencizer` or some custom
 function. It will raise an error otherwise.
 
 If the span happens to cross sentence boundaries, all sentences the span
 overlaps with will be returned.
 
 > #### Example
 >
 > ```python
 > doc = nlp("Give it back! He pleaded.")
 > span = doc[2:4]
 > assert len(span.sents) == 2
 > ```
 
-| Name        | Description                                                                 |
-| ----------- | --------------------------------------------------------------------------- |
-| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ |
+| Name        | Description                                                    |
+| ----------- | -------------------------------------------------------------- |
+| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ |
 
 ## Attributes {id="attributes"}

From 6aa6b86d496c8d9271f42c077a79f9bfb88687ac Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Wed, 1 Mar 2023 16:02:55 +0100
Subject: [PATCH 24/32] Make generation of empty `KnowledgeBase` instances
 configurable in `EntityLinker` (#12320)

* Make empty_kb() configurable.

* Format.

* Update docs.

* Be more specific in KB serialization test.

* Update KB serialization tests. Update docs.

* Remove doc update for batched candidate generation.

* Fix serialization of subclassed KB in tests.

* Format.

* Update docstring.

* Update docstring.

* Switch from pickle to json for custom field serialization.
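In practice this means an empty-KB factory can be registered and selected in the config. A minimal sketch, with `my.CustomEmptyKB.v1` as a placeholder name:

```python
# Sketch of a factory for the new generate_empty_kb hook; the @misc name
# "my.CustomEmptyKB.v1" is a placeholder for your own registered function.
from typing import Callable

from spacy.kb import InMemoryLookupKB, KnowledgeBase
from spacy.util import registry
from spacy.vocab import Vocab


@registry.misc("my.CustomEmptyKB.v1")
def create_empty_kb() -> Callable[[Vocab, int], KnowledgeBase]:
    def empty_kb_factory(vocab: Vocab, entity_vector_length: int) -> KnowledgeBase:
        # Swap in a KnowledgeBase subclass here to keep its type across IO.
        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)

    return empty_kb_factory
```

Selecting it then only takes a `[components.entity_linker.generate_empty_kb]` block with `@misc = "my.CustomEmptyKB.v1"`, analogous to the test configuration in the diff below.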
--- spacy/ml/models/entity_linker.py | 8 +++ spacy/pipeline/entity_linker.py | 11 +++- spacy/tests/serialize/test_serialize_kb.py | 71 +++++++++++++++++++--- website/docs/api/architectures.mdx | 10 ++- website/docs/api/entitylinker.mdx | 28 +++++---- 5 files changed, 101 insertions(+), 27 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 299b6bb52..7332ca199 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -89,6 +89,14 @@ def load_kb( return kb_from_file +@registry.misc("spacy.EmptyKB.v2") +def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length) + + return empty_kb_factory + + @registry.misc("spacy.EmptyKB.v1") def empty_kb( entity_vector_length: int, diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index a11964117..f2dae0529 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -54,6 +54,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, "overwrite": True, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, @@ -80,6 +81,7 @@ def make_entity_linker( get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, scorer: Optional[Callable], use_gold_ents: bool, @@ -101,6 +103,7 @@ def make_entity_linker( get_candidates_batch ( Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. @@ -135,6 +138,7 @@ def make_entity_linker( entity_vector_length=entity_vector_length, get_candidates=get_candidates, get_candidates_batch=get_candidates_batch, + generate_empty_kb=generate_empty_kb, overwrite=overwrite, scorer=scorer, use_gold_ents=use_gold_ents, @@ -175,6 +179,7 @@ class EntityLinker(TrainablePipe): get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, @@ -198,6 +203,7 @@ class EntityLinker(TrainablePipe): Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. 
@@ -220,6 +226,7 @@ class EntityLinker(TrainablePipe): self.model = model self.name = name self.labels_discard = list(labels_discard) + # how many neighbour sentences to take into account self.n_sents = n_sents self.incl_prior = incl_prior self.incl_context = incl_context @@ -227,9 +234,7 @@ class EntityLinker(TrainablePipe): self.get_candidates_batch = get_candidates_batch self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) - # how many neighbour sentences to take into account - # create an empty KB by default - self.kb = empty_kb(entity_vector_length)(self.vocab) + self.kb = generate_empty_kb(self.vocab, entity_vector_length) self.scorer = scorer self.use_gold_ents = use_gold_ents self.candidates_batch_size = candidates_batch_size diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 8d3653ab1..f9d2e226b 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,7 +1,10 @@ -from typing import Callable +from pathlib import Path +from typing import Callable, Iterable, Any, Dict -from spacy import util -from spacy.util import ensure_path, registry, load_model_from_config +import srsly + +from spacy import util, Errors +from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList from spacy.kb.kb_in_memory import InMemoryLookupKB from spacy.vocab import Vocab from thinc.api import Config @@ -91,7 +94,10 @@ def test_serialize_subclassed_kb(): [components.entity_linker] factory = "entity_linker" - + + [components.entity_linker.generate_empty_kb] + @misc = "kb_test.CustomEmptyKB.v1" + [initialize] [initialize.components] @@ -99,7 +105,7 @@ def test_serialize_subclassed_kb(): [initialize.components.entity_linker] [initialize.components.entity_linker.kb_loader] - @misc = "spacy.CustomKB.v1" + @misc = "kb_test.CustomKB.v1" entity_vector_length = 342 custom_field = 666 """ @@ -109,10 +115,57 @@ def test_serialize_subclassed_kb(): super().__init__(vocab, entity_vector_length) self.custom_field = custom_field - @registry.misc("spacy.CustomKB.v1") + def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well.""" + path = ensure_path(path) + if not path.exists(): + path.mkdir(parents=True) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def serialize_custom_fields(file_path: Path) -> None: + srsly.write_json(file_path, {"custom_field": self.custom_field}) + + serialize = { + "contents": lambda p: self.write_contents(p), + "strings.json": lambda p: self.vocab.strings.to_disk(p), + "custom_fields": lambda p: serialize_custom_fields(p), + } + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well.""" + path = ensure_path(path) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def deserialize_custom_fields(file_path: Path) -> None: + self.custom_field = srsly.read_json(file_path)["custom_field"] + + deserialize: Dict[str, Callable[[Any], Any]] = { + "contents": lambda p: self.read_contents(p), + "strings.json": lambda p: self.vocab.strings.from_disk(p), + "custom_fields": lambda p: deserialize_custom_fields(p), + } + util.from_disk(path, 
deserialize, exclude) + + @registry.misc("kb_test.CustomEmptyKB.v1") + def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return SubInMemoryLookupKB( + vocab=vocab, + entity_vector_length=entity_vector_length, + custom_field=0, + ) + + return empty_kb_factory + + @registry.misc("kb_test.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int - ) -> Callable[[Vocab], InMemoryLookupKB]: + ) -> Callable[[Vocab], SubInMemoryLookupKB]: def custom_kb_factory(vocab): kb = SubInMemoryLookupKB( vocab=vocab, @@ -139,6 +192,6 @@ def test_serialize_subclassed_kb(): nlp2 = util.load_model_from_path(tmp_dir) entity_linker2 = nlp2.get_pipe("entity_linker") # After IO, the KB is the standard one - assert type(entity_linker2.kb) == InMemoryLookupKB + assert type(entity_linker2.kb) == SubInMemoryLookupKB assert entity_linker2.kb.entity_vector_length == 342 - assert not hasattr(entity_linker2.kb, "custom_field") + assert entity_linker2.kb.custom_field == 666 diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index 966b5830a..268c04a07 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -899,15 +899,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a | `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.EmptyKB.v1 {id="EmptyKB"} +### spacy.EmptyKB.v1 {id="EmptyKB.v1"} A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) -instance. This is the default when a new entity linker component is created. +instance. | Name | Description | | ---------------------- | ----------------------------------------------------------------------------------- | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | +### spacy.EmptyKB.v2 {id="EmptyKB"} + +A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) +instance. This is the default when a new entity linker component is created. It +returns a `Callable[[Vocab, int], InMemoryLookupKB]`. + ### spacy.KBFromFile.v1 {id="KBFromFile"} A function that reads an existing `KnowledgeBase` from file. diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index bafb2f2da..d84dd3ca9 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -53,19 +53,21 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. 
~~bool~~ |
-| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
-| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
-| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
-| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
-| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
-| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
-| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
-| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
+| Setting                                              | Description |
+| ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
+| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
+| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
+| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
+| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
+| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
+| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
+| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
+| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
+| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
+| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
+| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
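For a quick sanity check of these settings, the two new hooks can also be supplied directly when adding the component. Both `@misc` names in the sketch below are placeholders for functions registered via `@registry.misc`, as shown earlier in this series:

```python
# Sketch: overriding the new hooks at add_pipe time. Both @misc names are
# placeholders for functions registered elsewhere via @registry.misc.
import spacy

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe(
    "entity_linker",
    config={
        "get_candidates_batch": {"@misc": "my.LowercaseCandidateBatchGenerator.v1"},
        "generate_empty_kb": {"@misc": "my.CustomEmptyKB.v1"},
    },
)
```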
```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
```

From 0bbc620dd80007ac22d8bf1c9f6202eebc748596 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 6 Mar 2023 14:48:57 +0100
Subject: [PATCH 25/32] Partially work around pending deprecation of
 pkg_resources (#12368)

* Handle deprecation of pkg_resources

* Replace `pkg_resources` with `importlib_metadata` for `spacy info --url`

* Remove requirements check from `spacy project` given the lack of alternatives

* Fix installed model URL method and CI test

* Fix types/handling, simplify catch-all return

* Move imports instead of disabling requirements check

* Format

* Reenable test with ignored deprecation warning

* Fix except

* Fix return

---
 .github/azure-steps.yml  |  5 +++++
 spacy/cli/info.py        | 17 ++++++++---------
 spacy/cli/project/run.py |  2 +-
 spacy/tests/test_cli.py  |  4 +++-
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index ed69f611b..b2ccf3d81 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -59,6 +59,11 @@ steps:
     displayName: 'Test download CLI'
     condition: eq(variables['python_version'], '3.8')
 
+  - script: |
+      python -W error -m spacy info ca_core_news_sm | grep -q download_url
+    displayName: 'Test download_url in info CLI'
+    condition: eq(variables['python_version'], '3.8')
+
   - script: |
      python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
     displayName: 'Test no warnings on load (#11713)'

diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 974bc0f4e..d82bf3fbc 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,6 +1,5 @@
 from typing import Optional, Dict, Any, Union, List
 import platform
-import pkg_resources
 import json
 from pathlib import Path
 from wasabi import Printer, MarkdownRenderer
@@ -10,6 +9,7 @@ from ._util import app, Arg, Opt, string_to_list
 from .download import get_model_filename, get_latest_version
 from .. import util
 from .. import about
+from ..compat import importlib_metadata
 
 
 @app.command("info")
@@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
     dist-info available.
""" try: - dist = pkg_resources.get_distribution(model) - data = json.loads(dist.get_metadata("direct_url.json")) - return data["url"] - except pkg_resources.DistributionNotFound: - # no such package - return None + dist = importlib_metadata.distribution(model) + text = dist.read_text("direct_url.json") + if isinstance(text, str): + data = json.loads(text) + return data["url"] except Exception: - # something else, like no file or invalid JSON - return None + pass + return None def info_model_url(model: str) -> Dict[str, Any]: diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 6dd174902..0f4858a99 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple import os.path from pathlib import Path -import pkg_resources from wasabi import msg from wasabi.util import locale_escape import sys @@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts exist. """ + import pkg_resources failed_pkgs_msgs: List[str] = [] conflicting_pkgs_msgs: List[str] = [] diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index dc7ce46fe..752750d33 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -2,7 +2,6 @@ import os import math from collections import Counter from typing import Tuple, List, Dict, Any -import pkg_resources import time from pathlib import Path @@ -1126,6 +1125,7 @@ def test_cli_find_threshold(capsys): ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.parametrize( "reqs,output", [ @@ -1158,6 +1158,8 @@ def test_cli_find_threshold(capsys): ], ) def test_project_check_requirements(reqs, output): + import pkg_resources + # excessive guard against unlikely package name try: pkg_resources.require("spacyunknowndoesnotexist12345") From 41b3a0d932aafb4db9db02ae2e03b560305e0d53 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Tue, 7 Mar 2023 13:10:45 +0100 Subject: [PATCH 26/32] Drop support for EntityLinker_v1. (#12377) --- spacy/errors.py | 1 + spacy/pipeline/entity_linker.py | 23 ++-------------------- spacy/tests/pipeline/test_entity_linker.py | 7 +------ 3 files changed, 4 insertions(+), 27 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 5049100d8..390de126e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -960,6 +960,7 @@ class Errors(metaclass=ErrorsWithCodes): E4003 = ("Training examples for distillation must have the exact same tokens in the " "reference and predicted docs.") E4004 = ("Backprop is not supported when is_train is not set.") + E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.") RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index cd13a4b21..6a187b6c3 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -117,28 +117,9 @@ def make_entity_linker( prediction is discarded. If None, predictions are not filtered by any threshold. save_activations (bool): save model activations in Doc when annotating. """ - if not model.attrs.get("include_span_maker", False): - try: - from spacy_legacy.components.entity_linker import EntityLinker_v1 - except: - raise ImportError( - "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12." 
- ) - # The only difference in arguments here is that use_gold_ents and threshold aren't available. - return EntityLinker_v1( - nlp.vocab, - model, - name, - labels_discard=labels_discard, - n_sents=n_sents, - incl_prior=incl_prior, - incl_context=incl_context, - entity_vector_length=entity_vector_length, - get_candidates=get_candidates, - overwrite=overwrite, - scorer=scorer, - ) + raise ValueError(Errors.E4005) + return EntityLinker( nlp.vocab, model, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index ed84ce674..87cacfc9d 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -993,13 +993,11 @@ def test_scorer_links(): @pytest.mark.parametrize( "name,config", [ - ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ], ) # fmt: on def test_legacy_architectures(name, config): - from spacy_legacy.components.entity_linker import EntityLinker_v1 # Ensure that the legacy architectures still work vector_length = 3 @@ -1022,10 +1020,7 @@ def test_legacy_architectures(name, config): return mykb entity_linker = nlp.add_pipe(name, config={"model": config}) - if config["@architectures"] == "spacy.EntityLinker.v1": - assert isinstance(entity_linker, EntityLinker_v1) - else: - assert isinstance(entity_linker, EntityLinker) + assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) optimizer = nlp.initialize(get_examples=lambda: train_examples) From 520279ff7c9af199928e2a727999162cb79c38a3 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Thu, 9 Mar 2023 09:37:19 +0100 Subject: [PATCH 27/32] `Tok2Vec`: Add `distill` method (#12108) * `Tok2Vec`: Add `distill` method * `Tok2Vec`: Refactor `update` * Add `Tok2Vec.distill` test * Update `distill` signature to accept `Example`s instead of separate teacher and student docs * Add docs * Remove docstring * Update test * Remove `update` calls from test * Update `Tok2Vec.distill` docstring --- spacy/pipeline/tok2vec.py | 125 ++++++++++++++++++++------- spacy/tests/pipeline/test_tok2vec.py | 83 ++++++++++++++++++ website/docs/api/tok2vec.mdx | 37 ++++++++ 3 files changed, 213 insertions(+), 32 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index c742aaeaa..d9639f8d5 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,5 +1,6 @@ -from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any +from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple from thinc.api import Model, set_dropout_rate, Optimizer, Config +from thinc.types import Floats2d from itertools import islice from .trainable_pipe import TrainablePipe @@ -157,39 +158,9 @@ class Tok2Vec(TrainablePipe): DOCS: https://spacy.io/api/tok2vec#update """ - if losses is None: - losses = {} validate_examples(examples, "Tok2Vec.update") docs = [eg.predicted for eg in examples] - set_dropout_rate(self.model, drop) - tokvecs, bp_tokvecs = self.model.begin_update(docs) - d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] - losses.setdefault(self.name, 0.0) - - def accumulate_gradient(one_d_tokvecs): - """Accumulate tok2vec loss and gradient. This is passed as a callback - to all but the last listener. Only the last one does the backprop. 
- """ - nonlocal d_tokvecs - for i in range(len(one_d_tokvecs)): - d_tokvecs[i] += one_d_tokvecs[i] - losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) - return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] - - def backprop(one_d_tokvecs): - """Callback to actually do the backprop. Passed to last listener.""" - accumulate_gradient(one_d_tokvecs) - d_docs = bp_tokvecs(d_tokvecs) - if sgd is not None: - self.finish_update(sgd) - return d_docs - - batch_id = Tok2VecListener.get_batch_id(docs) - for listener in self.listeners[:-1]: - listener.receive(batch_id, tokvecs, accumulate_gradient) - if self.listeners: - self.listeners[-1].receive(batch_id, tokvecs, backprop) - return losses + return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses) def get_loss(self, examples, scores) -> None: pass @@ -219,6 +190,96 @@ class Tok2Vec(TrainablePipe): def add_label(self, label): raise NotImplementedError + def distill( + self, + teacher_pipe: Optional["TrainablePipe"], + examples: Iterable["Example"], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ) -> Dict[str, float]: + """Performs an update of the student pipe's model using the + student's distillation examples and sets the annotations + of the teacher's distillation examples using the teacher pipe. + + teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use + for prediction. + examples (Iterable[Example]): Distillation examples. The reference (teacher) + and predicted (student) docs must have the same number of tokens and the + same orthography. + drop (float): dropout rate. + sgd (Optional[Optimizer]): An optimizer. Will be created via + create_optimizer if not set. + losses (Optional[Dict[str, float]]): Optional record of loss during + distillation. + RETURNS: The updated losses dictionary. + + DOCS: https://spacy.io/api/tok2vec#distill + """ + # By default we require a teacher pipe, but there are downstream + # implementations that don't require a pipe. + if teacher_pipe is None: + raise ValueError(Errors.E4002.format(name=self.name)) + teacher_docs = [eg.reference for eg in examples] + student_docs = [eg.predicted for eg in examples] + teacher_preds = teacher_pipe.predict(teacher_docs) + teacher_pipe.set_annotations(teacher_docs, teacher_preds) + return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses) + + def _update_with_docs( + self, + docs: Iterable[Doc], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ): + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + set_dropout_rate(self.model, drop) + + tokvecs, accumulate_gradient, backprop = self._create_backprops( + docs, losses, sgd=sgd + ) + batch_id = Tok2VecListener.get_batch_id(docs) + for listener in self.listeners[:-1]: + listener.receive(batch_id, tokvecs, accumulate_gradient) + if self.listeners: + self.listeners[-1].receive(batch_id, tokvecs, backprop) + return losses + + def _create_backprops( + self, + docs: Iterable[Doc], + losses: Dict[str, float], + *, + sgd: Optional[Optimizer] = None, + ) -> Tuple[Floats2d, Callable, Callable]: + tokvecs, bp_tokvecs = self.model.begin_update(docs) + d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] + + def accumulate_gradient(one_d_tokvecs): + """Accumulate tok2vec loss and gradient. This is passed as a callback + to all but the last listener. Only the last one does the backprop. 
+ """ + nonlocal d_tokvecs + for i in range(len(one_d_tokvecs)): + d_tokvecs[i] += one_d_tokvecs[i] + losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) + return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] + + def backprop(one_d_tokvecs): + """Callback to actually do the backprop. Passed to last listener.""" + accumulate_gradient(one_d_tokvecs) + d_docs = bp_tokvecs(d_tokvecs) + if sgd is not None: + self.finish_update(sgd) + return d_docs + + return tokvecs, accumulate_gradient, backprop + class Tok2VecListener(Model): """A layer that gets fed its answers from an upstream connection, diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index ee62b1ab4..6929b76fa 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -540,3 +540,86 @@ def test_tok2vec_listeners_textcat(): assert cats1["imperative"] < 0.9 assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] + + +cfg_string_distillation = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","tagger"] + + [components] + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v2" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v2" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +def test_tok2vec_distillation_teacher_annotations(): + orig_config = Config().from_str(cfg_string_distillation) + teacher_nlp = util.load_model_from_config( + orig_config, auto_fill=True, validate=True + ) + student_nlp = util.load_model_from_config( + orig_config, auto_fill=True, validate=True + ) + + train_examples_teacher = [] + train_examples_student = [] + for t in TRAIN_DATA: + train_examples_teacher.append( + Example.from_dict(teacher_nlp.make_doc(t[0]), t[1]) + ) + train_examples_student.append( + Example.from_dict(student_nlp.make_doc(t[0]), t[1]) + ) + + optimizer = teacher_nlp.initialize(lambda: train_examples_teacher) + student_nlp.initialize(lambda: train_examples_student) + + # Since Language.distill creates a copy of the examples to use as + # its internal teacher/student docs, we'll need to monkey-patch the + # tok2vec pipe's distill method. 
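+    # The wrapper below asserts that the reference (teacher) docs carry no
+    # tensor before distillation and a filled tensor afterwards, i.e. that
+    # Tok2Vec.distill really ran the teacher pipe and set its annotations.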
+    student_tok2vec = student_nlp.get_pipe("tok2vec")
+    student_tok2vec._old_distill = student_tok2vec.distill
+
+    def tok2vec_distill_wrapper(
+        self,
+        teacher_pipe,
+        examples,
+        **kwargs,
+    ):
+        assert all(not eg.reference.tensor.any() for eg in examples)
+        out = self._old_distill(teacher_pipe, examples, **kwargs)
+        assert all(eg.reference.tensor.any() for eg in examples)
+        return out
+
+    student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec)
+    student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={})
diff --git a/website/docs/api/tok2vec.mdx b/website/docs/api/tok2vec.mdx
index a1bb1265e..8b6d2380b 100644
--- a/website/docs/api/tok2vec.mdx
+++ b/website/docs/api/tok2vec.mdx
@@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~ |

+## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"}
+
+Performs an update of the student pipe's model using the student's distillation
+examples and sets the annotations of the teacher's distillation examples using
+the teacher pipe.
+
+Unlike other trainable pipes, the student pipe doesn't directly learn its
+representations from the teacher. However, since downstream pipes that do
+perform distillation expect the tok2vec annotations to be present on the
+correct distillation examples, we need to ensure that they are set beforehand.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not have to have gold
+annotations, the teacher can add its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("tok2vec")
+> student_pipe = student.add_pipe("tok2vec")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~ |
+| `examples`     | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop`         | Dropout rate. ~~float~~ |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
 ## Tok2Vec.pipe {id="pipe",tag="method"}

 Apply the pipe to a stream of documents.
This usually happens under the hood From 6ae7618418d1a2514d55bff041fb196957844de6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Sun, 19 Mar 2023 23:41:20 +0100 Subject: [PATCH 28/32] Clean up Vocab constructor (#12290) * Clean up Vocab constructor * Change effective type of `strings` from `Iterable[str]` to `Optional[StringStore]` * Don't automatically add strings to vocab * Change default values to `None` * Remove `**deprecated_kwargs` * Format --- spacy/strings.pyi | 2 +- spacy/tests/pipeline/test_pipe_methods.py | 3 ++- .../serialize/test_serialize_vocab_strings.py | 27 +++++++++++-------- spacy/tests/vocab_vectors/test_lexeme.py | 2 +- spacy/vocab.pyi | 2 +- spacy/vocab.pyx | 18 +++++++------ website/docs/api/vocab.mdx | 5 ++-- 7 files changed, 34 insertions(+), 25 deletions(-) diff --git a/spacy/strings.pyi b/spacy/strings.pyi index d9509ff57..38dee7034 100644 --- a/spacy/strings.pyi +++ b/spacy/strings.pyi @@ -2,7 +2,7 @@ from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overlo from pathlib import Path class StringStore: - def __init__(self, strings: Optional[Iterable[str]]) -> None: ... + def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ... @overload def __getitem__(self, string_or_hash: str) -> int: ... @overload diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 9b9786f04..39611a742 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -9,6 +9,7 @@ from spacy.lang.en import English from spacy.lang.en.syntax_iterators import noun_chunks from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.strings import StringStore from spacy.tokens import Doc from spacy.training import Example from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir @@ -131,7 +132,7 @@ def test_issue5458(): # Test that the noun chuncker does not generate overlapping spans # fmt: off words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] - vocab = Vocab(strings=words) + vocab = Vocab(strings=StringStore(words)) deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index fd80c3d8e..f6356ac9e 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -13,8 +13,11 @@ from spacy.vocab import Vocab from ..util import make_tempdir -test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] -test_strings_attrs = [(["rats", "are", "cute"], "Hello")] +test_strings = [ + (StringStore(), StringStore()), + (StringStore(["rats", "are", "cute"]), StringStore(["i", "like", "rats"])), +] +test_strings_attrs = [(StringStore(["rats", "are", "cute"]), "Hello")] @pytest.mark.issue(599) @@ -81,7 +84,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): vocab2 = Vocab(strings=strings2) vocab1_b = vocab1.to_bytes() vocab2_b = vocab2.to_bytes() - if strings1 == strings2: + if strings1.to_bytes() == strings2.to_bytes(): assert vocab1_b == vocab2_b else: assert vocab1_b != vocab2_b @@ -117,11 +120,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): def 
test_serialize_vocab_lex_attrs_bytes(strings, lex_attr): vocab1 = Vocab(strings=strings) vocab2 = Vocab() - vocab1[strings[0]].norm_ = lex_attr - assert vocab1[strings[0]].norm_ == lex_attr - assert vocab2[strings[0]].norm_ != lex_attr + s = next(iter(vocab1.strings)) + vocab1[s].norm_ = lex_attr + assert vocab1[s].norm_ == lex_attr + assert vocab2[s].norm_ != lex_attr vocab2 = vocab2.from_bytes(vocab1.to_bytes()) - assert vocab2[strings[0]].norm_ == lex_attr + assert vocab2[s].norm_ == lex_attr @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -136,14 +140,15 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr): def test_serialize_vocab_lex_attrs_disk(strings, lex_attr): vocab1 = Vocab(strings=strings) vocab2 = Vocab() - vocab1[strings[0]].norm_ = lex_attr - assert vocab1[strings[0]].norm_ == lex_attr - assert vocab2[strings[0]].norm_ != lex_attr + s = next(iter(vocab1.strings)) + vocab1[s].norm_ = lex_attr + assert vocab1[s].norm_ == lex_attr + assert vocab2[s].norm_ != lex_attr with make_tempdir() as d: file_path = d / "vocab" vocab1.to_disk(file_path) vocab2 = vocab2.from_disk(file_path) - assert vocab2[strings[0]].norm_ == lex_attr + assert vocab2[s].norm_ == lex_attr @pytest.mark.parametrize("strings1,strings2", test_strings) diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index d91f41db3..cd7f954ae 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -17,7 +17,7 @@ def test_issue361(en_vocab, text1, text2): @pytest.mark.issue(600) def test_issue600(): - vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) + vocab = Vocab() doc = Doc(vocab, words=["hello"]) doc[0].tag_ = "NN" diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index 871044fff..e4a88bfd8 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -26,7 +26,7 @@ class Vocab: def __init__( self, lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ..., - strings: Optional[Union[List[str], StringStore]] = ..., + strings: Optional[StringStore] = ..., lookups: Optional[Lookups] = ..., oov_prob: float = ..., writing_system: Dict[str, Any] = ..., diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f3c3595ef..0d3c9c883 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -49,9 +49,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ - def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, - oov_prob=-20., writing_system={}, get_noun_chunks=None, - **deprecated_kwargs): + def __init__(self, lex_attr_getters=None, strings=None, lookups=None, + oov_prob=-20., writing_system=None, get_noun_chunks=None): """Create the vocabulary. 
lex_attr_getters (dict): A dictionary mapping attribute IDs to @@ -69,16 +68,19 @@ cdef class Vocab: self.cfg = {'oov_prob': oov_prob} self.mem = Pool() self._by_orth = PreshMap() - self.strings = StringStore() self.length = 0 - if strings: - for string in strings: - _ = self[string] + if strings is None: + self.strings = StringStore() + else: + self.strings = strings self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings) self.vectors = Vectors(strings=self.strings) self.lookups = lookups - self.writing_system = writing_system + if writing_system is None: + self.writing_system = {} + else: + self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks property vectors: diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 3faf1f1a0..304040f9c 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -17,14 +17,15 @@ Create the vocabulary. > #### Example > > ```python +> from spacy.strings import StringStore > from spacy.vocab import Vocab -> vocab = Vocab(strings=["hello", "world"]) +> vocab = Vocab(strings=StringStore(["hello", "world"])) > ``` | Name | Description | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ | -| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | +| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values. ~~Optional[StringStore]~~ | | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | From 9340eb8ad2a7525096c902112c7cf1a21d145df2 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 20 Mar 2023 00:34:35 +0100 Subject: [PATCH 29/32] Introduce hierarchy for EL `Candidate` objects (#12341) * Convert Candidate from Cython to Python class. * Format. * Fix .entity_ typo in _add_activations() usage. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem * Update doc string of BaseCandidate.__init__(). * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem * Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate. * Adjust Candidate to support and mandate numerical entity IDs. * Format. * Fix docstring and docs. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem * Rename alias -> mention. * Refactor Candidate attribute names. Update docs and tests accordingly. * Refacor Candidate attributes and their usage. * Format. * Fix mypy error. * Update error code in line with v4 convention. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem * Updated error code. * Simplify interface for int/str representations. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem * Rename 'alias' to 'mention'. * Port Candidate and InMemoryCandidate to Cython. * Remove redundant entry in setup.py. * Add abstract class check. * Drop storing mention. 
* Update spacy/kb/candidate.pxd Co-authored-by: Sofie Van Landeghem * Fix entity_id refactoring problems in docstrings. * Drop unused InMemoryCandidate._entity_hash. * Update docstrings. * Move attributes out of Candidate. * Partially fix alias/mention terminology usage. Convert Candidate to interface. * Remove prior_prob from supported properties in Candidate. Introduce KnowledgeBase.supports_prior_probs(). * Update docstrings related to prior_prob. * Update alias/mention usage in doc(strings). * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem * Mention -> alias renaming. Drop Candidate.mentions(). Drop InMemoryLookupKB.get_alias_candidates() from docs. * Update docstrings. * Fix InMemoryCandidate attribute names. * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem * Update spacy/ml/models/entity_linker.py Co-authored-by: Sofie Van Landeghem * Update W401 test. * Update spacy/errors.py Co-authored-by: Sofie Van Landeghem * Update spacy/kb/kb.pyx Co-authored-by: Sofie Van Landeghem * Use Candidate output type for toy generators in the test suite to mimick best practices * fix docs * fix import --------- Co-authored-by: Sofie Van Landeghem --- spacy/errors.py | 8 +- spacy/kb/__init__.py | 5 +- spacy/kb/candidate.pxd | 19 ++-- spacy/kb/candidate.pyx | 120 ++++++++++++--------- spacy/kb/kb.pyx | 21 ++-- spacy/kb/kb_in_memory.pyx | 31 +++--- spacy/ml/models/entity_linker.py | 24 ++++- spacy/pipeline/entity_linker.py | 22 ++-- spacy/tests/pipeline/test_entity_linker.py | 49 +++++---- spacy/tests/serialize/test_serialize_kb.py | 12 ++- website/docs/api/inmemorylookupkb.mdx | 40 +++---- website/docs/api/kb.mdx | 51 ++++----- 12 files changed, 223 insertions(+), 179 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 390de126e..e1f7e7400 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -82,7 +82,7 @@ class Warnings(metaclass=ErrorsWithCodes): "ignoring the duplicate entry.") W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " "incorrect. Modify PhraseMatcher._terminal_hash to fix.") - W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " + W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in " "the Knowledge Base.") W026 = ("Unable to set all sentence boundaries from dependency parses. If " "you are constructing a parse tree incrementally by setting " @@ -209,7 +209,11 @@ class Warnings(metaclass=ErrorsWithCodes): "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") + # v4 warning strings W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") + W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability " + "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure " + "to return `True` in `.supports_prior_probs`.") class Errors(metaclass=ErrorsWithCodes): @@ -961,6 +965,8 @@ class Errors(metaclass=ErrorsWithCodes): "reference and predicted docs.") E4004 = ("Backprop is not supported when is_train is not set.") E4005 = ("EntityLinker_v1 is not supported in spaCy v4. 
Update your configuration.") + E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.") + RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py index 1d70a9b34..ff0e209e3 100644 --- a/spacy/kb/__init__.py +++ b/spacy/kb/__init__.py @@ -1,3 +1,6 @@ from .kb import KnowledgeBase from .kb_in_memory import InMemoryLookupKB -from .candidate import Candidate, get_candidates, get_candidates_batch +from .candidate import Candidate, InMemoryCandidate + + +__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd index 942ce9dd0..f21f423e4 100644 --- a/spacy/kb/candidate.pxd +++ b/spacy/kb/candidate.pxd @@ -1,12 +1,15 @@ -from .kb cimport KnowledgeBase from libcpp.vector cimport vector +from .kb_in_memory cimport InMemoryLookupKB from ..typedefs cimport hash_t -# Object used by the Entity Linker that summarizes one entity-alias candidate combination. cdef class Candidate: - cdef readonly KnowledgeBase kb - cdef hash_t entity_hash - cdef float entity_freq - cdef vector[float] entity_vector - cdef hash_t alias_hash - cdef float prior_prob + pass + + +cdef class InMemoryCandidate(Candidate): + cdef readonly hash_t _entity_hash + cdef readonly hash_t _alias_hash + cpdef vector[float] _entity_vector + cdef float _prior_prob + cdef readonly InMemoryLookupKB _kb + cdef float _entity_freq diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx index c89efeb03..3d8da4b95 100644 --- a/spacy/kb/candidate.pyx +++ b/spacy/kb/candidate.pyx @@ -1,74 +1,96 @@ # cython: infer_types=True, profile=True -from typing import Iterable -from .kb cimport KnowledgeBase -from ..tokens import Span +from .kb_in_memory cimport InMemoryLookupKB +from ..errors import Errors cdef class Candidate: - """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved - to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking + """A `Candidate` object refers to a textual mention that may or may not be resolved + to a specific entity from a Knowledge Base. This will be used as input for the entity linking algorithm which will disambiguate the various candidates to the correct one. - Each candidate (alias, entity) pair is assigned a certain prior probability. + Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base, + is assigned a certain prior probability. DOCS: https://spacy.io/api/kb/#candidate-init """ - def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): - self.kb = kb - self.entity_hash = entity_hash - self.entity_freq = entity_freq - self.entity_vector = entity_vector - self.alias_hash = alias_hash - self.prior_prob = prior_prob + def __init__(self): + # Make sure abstract Candidate is not instantiated. 
+        if self.__class__ == Candidate:
+            raise TypeError(
+                Errors.E1046.format(cls_name=self.__class__.__name__)
+            )

     @property
-    def entity(self) -> int:
-        """RETURNS (uint64): hash of the entity's KB ID/name"""
-        return self.entity_hash
+    def entity_id(self) -> int:
+        """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID,
+        otherwise the hash of the entity ID string)."""
+        raise NotImplementedError

     @property
-    def entity_(self) -> str:
-        """RETURNS (str): ID/name of this entity in the KB"""
-        return self.kb.vocab.strings[self.entity_hash]
+    def entity_id_(self) -> str:
+        """RETURNS (str): String representation of entity ID."""
+        raise NotImplementedError

     @property
-    def alias(self) -> int:
-        """RETURNS (uint64): hash of the alias"""
-        return self.alias_hash
+    def entity_vector(self) -> vector[float]:
+        """RETURNS (vector[float]): Entity vector."""
+        raise NotImplementedError
+
+
+cdef class InMemoryCandidate(Candidate):
+    """Candidate for InMemoryLookupKB."""
+
+    def __init__(
+        self,
+        kb: InMemoryLookupKB,
+        entity_hash: int,
+        alias_hash: int,
+        entity_vector: vector[float],
+        prior_prob: float,
+        entity_freq: float
+    ):
+        """
+        kb (InMemoryLookupKB): InMemoryLookupKB instance.
+        entity_hash (int): Entity ID as hash that can be looked up with InMemoryLookupKB.vocab.strings.__getitem__().
+        entity_freq (float): Entity frequency in KB corpus.
+        entity_vector (List[float]): Entity embedding.
+        alias_hash (int): Alias hash.
+        prior_prob (float): Prior probability of entity for this alias. I.e. the probability that, independent of
+            the context, this alias - which matches one of this entity's aliases - resolves to this entity.
+        """
+        super().__init__()
+
+        self._entity_hash = entity_hash
+        self._entity_vector = entity_vector
+        self._prior_prob = prior_prob
+        self._kb = kb
+        self._alias_hash = alias_hash
+        self._entity_freq = entity_freq

     @property
-    def alias_(self) -> str:
-        """RETURNS (str): ID of the original alias"""
-        return self.kb.vocab.strings[self.alias_hash]
+    def entity_id(self) -> int:
+        return self._entity_hash

     @property
-    def entity_freq(self) -> float:
-        return self.entity_freq
-
-    @property
-    def entity_vector(self) -> Iterable[float]:
-        return self.entity_vector
+    def entity_vector(self) -> vector[float]:
+        return self._entity_vector

     @property
     def prior_prob(self) -> float:
-        return self.prior_prob
+        """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to
+        this entity."""
+        return self._prior_prob

+    @property
+    def alias(self) -> str:
+        """RETURNS (str): Alias."""
+        return self._kb.vocab.strings[self._alias_hash]

-def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
-    """
-    Return candidate entities for a given mention and fetching appropriate entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Span): Entity mention for which to identify candidates.
-    RETURNS (Iterable[Candidate]): Identified candidates.
-    """
-    return kb.get_candidates(mention)
+    @property
+    def entity_id_(self) -> str:
+        return self._kb.vocab.strings[self._entity_hash]

-
-def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
-    """
-    Return candidate entities for the given mentions and fetching appropriate entries from the index.
-    kb (KnowledgeBase): Knowledge base to query.
-    mention (Iterable[Span]): Entity mentions for which to identify candidates.
- RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. - """ - return kb.get_candidates_batch(mentions) + @property + def entity_freq(self) -> float: + """RETURNS (float): Entity frequency in KB corpus.""" + return self._entity_freq diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index ce4bc0138..1cb08f488 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -32,9 +32,10 @@ cdef class KnowledgeBase: def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: """ - Return candidate entities for specified texts. Each candidate defines the entity, the original alias, - and the prior probability of that alias resolving to that entity. - If no candidate is found for a given text, an empty list is returned. + Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the + entity's embedding vector. Depending on the KB implementation, further properties - such as the prior + probability of the specified mention text resolving to that entity - might be included. + If no candidates are found for a given mention, an empty list is returned. mentions (Iterable[Span]): Mentions for which to get candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. """ @@ -42,9 +43,10 @@ cdef class KnowledgeBase: def get_candidates(self, mention: Span) -> Iterable[Candidate]: """ - Return candidate entities for specified text. Each candidate defines the entity, the original alias, - and the prior probability of that alias resolving to that entity. - If the no candidate is found for a given text, an empty list is returned. + Return candidate entities for a specific mention. Each candidate defines at least the entity and the + entity's embedding vector. Depending on the KB implementation, further properties - such as the prior + probability of the specified mention text resolving to that entity - might be included. + If no candidate is found for the given mention, an empty list is returned. mention (Span): Mention for which to get candidates. RETURNS (Iterable[Candidate]): Identified candidates. """ @@ -106,3 +108,10 @@ cdef class KnowledgeBase: raise NotImplementedError( Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) ) + + @property + def supports_prior_probs(self) -> bool: + """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions.""" + raise NotImplementedError( + Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__) + ) diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 2a74d047b..c9ced8309 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -18,7 +18,7 @@ from .. 
import util from ..util import SimpleFrozenList, ensure_path from ..vocab cimport Vocab from .kb cimport KnowledgeBase -from .candidate import Candidate as Candidate +from .candidate import InMemoryCandidate cdef class InMemoryLookupKB(KnowledgeBase): @@ -226,10 +226,10 @@ cdef class InMemoryLookupKB(KnowledgeBase): alias_entry.probs = probs self._aliases_table[alias_index] = alias_entry - def get_candidates(self, mention: Span) -> Iterable[Candidate]: - return self.get_alias_candidates(mention.text) # type: ignore + def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]: + return self._get_alias_candidates(mention.text) # type: ignore - def get_alias_candidates(self, str alias) -> Iterable[Candidate]: + def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]: """ Return candidate entities for an alias. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. @@ -241,14 +241,18 @@ cdef class InMemoryLookupKB(KnowledgeBase): alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] - return [Candidate(kb=self, - entity_hash=self._entries[entry_index].entity_hash, - entity_freq=self._entries[entry_index].freq, - entity_vector=self._vectors_table[self._entries[entry_index].vector_index], - alias_hash=alias_hash, - prior_prob=prior_prob) - for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) - if entry_index != 0] + return [ + InMemoryCandidate( + kb=self, + entity_hash=self._entries[entry_index].entity_hash, + alias_hash=alias_hash, + entity_vector=self._vectors_table[self._entries[entry_index].vector_index], + prior_prob=prior_prob, + entity_freq=self._entries[entry_index].freq + ) + for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) + if entry_index != 0 + ] def get_vector(self, str entity): cdef hash_t entity_hash = self.vocab.strings[entity] @@ -279,6 +283,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): return 0.0 + def supports_prior_probs(self) -> bool: + return True + def to_bytes(self, **kwargs): """Serialize the current state to a binary string. """ diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 7332ca199..7fe0b4741 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -6,7 +6,7 @@ from thinc.api import Model, Maxout, Linear, tuplify, Ragged from ...util import registry from ...kb import KnowledgeBase, InMemoryLookupKB -from ...kb import Candidate, get_candidates, get_candidates_batch +from ...kb import Candidate from ...vocab import Vocab from ...tokens import Span, Doc from ..extract_spans import extract_spans @@ -117,3 +117,25 @@ def create_candidates_batch() -> Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ]: return get_candidates_batch + + +def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: + """ + Return candidate entities for a given mention and fetching appropriate entries from the index. + kb (KnowledgeBase): Knowledge base to query. + mention (Span): Entity mention for which to identify candidates. + RETURNS (Iterable[Candidate]): Identified candidates. + """ + return kb.get_candidates(mention) + + +def get_candidates_batch( + kb: KnowledgeBase, mentions: Iterable[Span] +) -> Iterable[Iterable[Candidate]]: + """ + Return candidate entities for the given mentions and fetching appropriate entries from the index. 
+ kb (KnowledgeBase): Knowledge base to query. + mentions (Iterable[Span]): Entity mentions for which to identify candidates. + RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. + """ + return kb.get_candidates_batch(mentions) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 6a187b6c3..caced9cfd 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,5 +1,5 @@ -from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any -from typing import cast +import warnings +from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast from numpy import dtype from thinc.types import Floats1d, Floats2d, Ints1d, Ragged from pathlib import Path @@ -10,14 +10,13 @@ from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import set_dropout_rate from ..kb import KnowledgeBase, Candidate -from ..ml import empty_kb from ..tokens import Doc, Span from .pipe import deserialize_config from .trainable_pipe import TrainablePipe from ..language import Language from ..vocab import Vocab from ..training import Example, validate_examples, validate_get_examples -from ..errors import Errors +from ..errors import Errors, Warnings from ..util import SimpleFrozenList, registry from .. import util from ..scorer import Scorer @@ -240,6 +239,8 @@ class EntityLinker(TrainablePipe): if candidates_batch_size < 1: raise ValueError(Errors.E1044) + if self.incl_prior and not self.kb.supports_prior_probs: + warnings.warn(Warnings.W401) def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will @@ -522,18 +523,19 @@ class EntityLinker(TrainablePipe): ) elif len(candidates) == 1 and self.threshold is None: # shortcut for efficiency reasons: take the 1 candidate - final_kb_ids.append(candidates[0].entity_) + final_kb_ids.append(candidates[0].entity_id_) self._add_activations( doc_scores=doc_scores, doc_ents=doc_ents, scores=[1.0], - ents=[candidates[0].entity_], + ents=[candidates[0].entity_id], ) else: random.shuffle(candidates) # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.incl_prior: + if self.incl_prior and self.kb.supports_prior_probs: + prior_probs = xp.asarray([c.prior_prob for c in candidates]) # type: ignore + else: prior_probs = xp.asarray([0.0 for _ in candidates]) scores = prior_probs # add in similarity from the context @@ -557,7 +559,7 @@ class EntityLinker(TrainablePipe): raise ValueError(Errors.E161) scores = prior_probs + sims - (prior_probs * sims) final_kb_ids.append( - candidates[scores.argmax().item()].entity_ + candidates[scores.argmax().item()].entity_id_ if self.threshold is None or scores.max() >= self.threshold else EntityLinker.NIL @@ -566,7 +568,7 @@ class EntityLinker(TrainablePipe): doc_scores=doc_scores, doc_ents=doc_ents, scores=scores, - ents=[c.entity for c in candidates], + ents=[c.entity_id for c in candidates], ) self._add_doc_activations( docs_scores=docs_scores, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 87cacfc9d..65406a36e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -7,10 +7,10 @@ from thinc.types import Ragged from spacy import registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle -from spacy.kb import Candidate, InMemoryLookupKB, 
get_candidates, KnowledgeBase +from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase from spacy.lang.en import English from spacy.ml import load_kb -from spacy.ml.models.entity_linker import build_span_maker +from spacy.ml.models.entity_linker import build_span_maker, get_candidates from spacy.pipeline import EntityLinker, TrainablePipe from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer @@ -465,16 +465,17 @@ def test_candidate_generation(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates + adam_ent_cands = get_candidates(mykb, adam_ent) assert len(get_candidates(mykb, douglas_ent)) == 2 - assert len(get_candidates(mykb, adam_ent)) == 1 + assert len(adam_ent_cands) == 1 assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive assert len(get_candidates(mykb, shrubbery_ent)) == 0 # test the content of the candidates - assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" - assert get_candidates(mykb, adam_ent)[0].alias_ == "adam" - assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) - assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) + assert adam_ent_cands[0].entity_id_ == "Q2" + assert adam_ent_cands[0].alias == "adam" + assert_almost_equal(adam_ent_cands[0].entity_freq, 12) + assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9) def test_el_pipe_configuration(nlp): @@ -502,7 +503,7 @@ def test_el_pipe_configuration(nlp): assert doc[2].ent_kb_id_ == "Q2" def get_lowercased_candidates(kb, span): - return kb.get_alias_candidates(span.text.lower()) + return kb._get_alias_candidates(span.text.lower()) def get_lowercased_candidates_batch(kb, spans): return [get_lowercased_candidates(kb, span) for span in spans] @@ -561,24 +562,22 @@ def test_vocab_serialization(nlp): mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]) adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) - candidates = mykb.get_alias_candidates("adam") + candidates = mykb._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity == q2_hash - assert candidates[0].entity_ == "Q2" - assert candidates[0].alias == adam_hash - assert candidates[0].alias_ == "adam" + assert candidates[0].entity_id == q2_hash + assert candidates[0].entity_id_ == "Q2" + assert candidates[0].alias == "adam" with make_tempdir() as d: mykb.to_disk(d / "kb") kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1) kb_new_vocab.from_disk(d / "kb") - candidates = kb_new_vocab.get_alias_candidates("adam") + candidates = kb_new_vocab._get_alias_candidates("adam") assert len(candidates) == 1 - assert candidates[0].entity == q2_hash - assert candidates[0].entity_ == "Q2" - assert candidates[0].alias == adam_hash - assert candidates[0].alias_ == "adam" + assert candidates[0].entity_id == q2_hash + assert candidates[0].entity_id_ == "Q2" + assert candidates[0].alias == "adam" assert kb_new_vocab.get_vector("Q2") == [2] assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) @@ -598,20 +597,20 @@ def test_append_alias(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates - assert len(mykb.get_alias_candidates("douglas")) == 2 + assert len(mykb._get_alias_candidates("douglas")) == 2 # append an alias mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) # test the size of the relevant candidates has been incremented - 
assert len(mykb.get_alias_candidates("douglas")) == 3 + assert len(mykb._get_alias_candidates("douglas")) == 3 # append the same alias-entity pair again should not work (will throw a warning) with pytest.warns(UserWarning): mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3) # test the size of the relevant candidates remained unchanged - assert len(mykb.get_alias_candidates("douglas")) == 3 + assert len(mykb._get_alias_candidates("douglas")) == 3 @pytest.mark.filterwarnings("ignore:\\[W036") @@ -908,11 +907,11 @@ def test_kb_to_bytes(): assert kb_2.contains_alias("Russ Cochran") assert kb_1.get_size_aliases() == kb_2.get_size_aliases() assert kb_1.get_alias_strings() == kb_2.get_alias_strings() - assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( - kb_2.get_alias_candidates("Russ Cochran") + assert len(kb_1._get_alias_candidates("Russ Cochran")) == len( + kb_2._get_alias_candidates("Russ Cochran") ) - assert len(kb_1.get_alias_candidates("Randomness")) == len( - kb_2.get_alias_candidates("Randomness") + assert len(kb_1._get_alias_candidates("Randomness")) == len( + kb_2._get_alias_candidates("Randomness") ) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index f9d2e226b..eb4254d31 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -66,19 +66,21 @@ def _check_kb(kb): assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_) + candidates = sorted( + kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_ + ) assert len(candidates) == 2 - assert candidates[0].entity_ == "Q007" + assert candidates[0].entity_id_ == "Q007" assert 6.999 < candidates[0].entity_freq < 7.01 assert candidates[0].entity_vector == [0, 0, 7] - assert candidates[0].alias_ == "double07" + assert candidates[0].alias == "double07" assert 0.899 < candidates[0].prior_prob < 0.901 - assert candidates[1].entity_ == "Q17" + assert candidates[1].entity_id_ == "Q17" assert 1.99 < candidates[1].entity_freq < 2.01 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].alias_ == "double07" + assert candidates[1].alias == "double07" assert 0.099 < candidates[1].prior_prob < 0.101 diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx index c24fe78d6..6fa6cb235 100644 --- a/website/docs/api/inmemorylookupkb.mdx +++ b/website/docs/api/inmemorylookupkb.mdx @@ -10,9 +10,9 @@ version: 3.5 The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and implements all of its methods. It stores all KB data in-memory and generates -[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with -entity names. It's highly optimized for both a low memory footprint and speed of -retrieval. +[`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions +with entity names. It's highly optimized for both a low memory footprint and +speed of retrieval. ## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"} @@ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base. ## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"} Given a certain textual mention as input, retrieve a list of candidate entities -of type [`Candidate`](/api/kb#candidate). Wraps +of type [`InMemoryCandidate`](/api/kb#candidate). Wraps [`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). 
> #### Example @@ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps > candidates = kb.get_candidates(doc[0:2]) > ``` -| Name | Description | -| ----------- | -------------------------------------------------------------------- | -| `mention` | The textual mention or alias. ~~Span~~ | -| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------ | +| `mention` | The textual mention or alias. ~~Span~~ | +| **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ | ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"} @@ -194,26 +194,10 @@ to you. > candidates = kb.get_candidates((doc[0:2], doc[3:])) > ``` -| Name | Description | -| ----------- | -------------------------------------------------------------------------------------------- | -| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ | -| **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ | - -## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"} - -Given a certain textual mention as input, retrieve a list of candidate entities -of type [`Candidate`](/api/kb#candidate). - -> #### Example -> -> ```python -> candidates = kb.get_alias_candidates("Douglas") -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------------- | -| `alias` | The textual mention or alias. ~~str~~ | -| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------ | +| `mentions` | The textual mentions. ~~Iterable[Span]~~ | +| **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ | ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"} diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx index 2b0d4d9d6..9536a3fe3 100644 --- a/website/docs/api/kb.mdx +++ b/website/docs/api/kb.mdx @@ -103,23 +103,6 @@ to you. | `mentions` | The textual mention or alias. ~~Iterable[Span]~~ | | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ | -## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"} - - - This method is _not_ available from spaCy 3.5 onwards. - - -From spaCy 3.5 on `KnowledgeBase` is an abstract class (with -[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to -allow more flexibility in customizing knowledge bases. Some of its methods were -moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring, -one of those being `get_alias_candidates()`. This method is now available as -[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). -Note: -[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates) -defaults to -[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). - ## KnowledgeBase.get_vector {id="get_vector",tag="method"} Given a certain entity ID, retrieve its pretrained entity vector. @@ -190,25 +173,27 @@ Restore the state of the knowledge base from a given directory. Note that the | `exclude` | List of components to exclude. 
~~Iterable[str]~~ |
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |

-## Candidate {id="candidate",tag="class"}
+## InMemoryCandidate {id="candidate",tag="class"}

-A `Candidate` object refers to a textual mention (alias) that may or may not be
-resolved to a specific entity from a `KnowledgeBase`. This will be used as input
-for the entity linking algorithm which will disambiguate the various candidates
-to the correct one. Each candidate `(alias, entity)` pair is assigned to a
-certain prior probability.
+An `InMemoryCandidate` object refers to a textual mention (alias) that may or
+may not be resolved to a specific entity from a `KnowledgeBase`. This will be
+used as input for the entity linking algorithm which will disambiguate the
+various candidates to the correct one. Each candidate `(alias, entity)` pair is
+assigned a certain prior probability.

-### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"}
+### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"}

-Construct a `Candidate` object. Usually this constructor is not called directly,
-but instead these objects are returned by the `get_candidates` method of the
-[`entity_linker`](/api/entitylinker) pipe.
+Construct an `InMemoryCandidate` object. Usually this constructor is not called
+directly, but instead these objects are returned by the `get_candidates` method
+of the [`entity_linker`](/api/entitylinker) pipe.

 > #### Example
 >
 > ```python
-> from spacy.kb import Candidate
-> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
+> from spacy.kb import InMemoryCandidate
+> candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
 > ```

| Name          | Description |
| ------------- | -------------------------------------------------------------------------- |
| `kb`          | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
-| `alias_hash`  | The hash of the textual mention or alias. ~~int~~ |
+| `alias_hash`  | The hash of the entity alias. ~~int~~ |
| `prior_prob`  | The prior probability of the `alias` referring to the `entity`. ~~float~~ |

-## Candidate attributes {id="candidate-attributes"}
+## InMemoryCandidate attributes {id="candidate-attributes"}

| Name | Description |
| --------------- | ------------------------------------------------------------------------ |

From 3102e2e27a7a095cb695a21c1ef21e6efdce9f8a Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Mon, 20 Mar 2023 12:25:18 +0100
Subject: [PATCH 30/32] Entity linking: use `SpanGroup` instead of `Iterable[Span]` for mentions (#12344)

* Convert Candidate from Cython to Python class.
* Format.
* Fix .entity_ typo in _add_activations() usage.
* Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span].
* Update docs.
* Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem
* Update doc string of BaseCandidate.__init__().
* Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem
* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.
* Adjust Candidate to support and mandate numerical entity IDs.
* Format.
* Fix docstring and docs.
* Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem
* Rename alias -> mention.
* Refactor Candidate attribute names. Update docs and tests accordingly.
* Refacor Candidate attributes and their usage. * Format. * Fix mypy error. * Update error code in line with v4 convention. * Reverse erroneous changes during merge. * Update return type in EL tests. * Re-add Candidate to setup.py. * Format updated docs. --------- Co-authored-by: Sofie Van Landeghem --- spacy/kb/__init__.py | 1 - spacy/kb/kb.pyx | 6 +++--- spacy/ml/models/entity_linker.py | 8 ++++---- spacy/pipeline/entity_linker.py | 13 ++++++++----- spacy/tests/pipeline/test_entity_linker.py | 1 - website/docs/api/inmemorylookupkb.mdx | 5 +++-- website/docs/api/kb.mdx | 11 +++++------ 7 files changed, 23 insertions(+), 22 deletions(-) diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py index ff0e209e3..c8a657d62 100644 --- a/spacy/kb/__init__.py +++ b/spacy/kb/__init__.py @@ -2,5 +2,4 @@ from .kb import KnowledgeBase from .kb_in_memory import InMemoryLookupKB from .candidate import Candidate, InMemoryCandidate - __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 1cb08f488..2d0e1d5a1 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union from cymem.cymem cimport Pool from .candidate import Candidate -from ..tokens import Span +from ..tokens import Span, SpanGroup from ..util import SimpleFrozenList from ..errors import Errors @@ -30,13 +30,13 @@ cdef class KnowledgeBase: self.entity_vector_length = entity_vector_length self.mem = Pool() - def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: + def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]: """ Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the entity's embedding vector. Depending on the KB implementation, further properties - such as the prior probability of the specified mention text resolving to that entity - might be included. If no candidates are found for a given mention, an empty list is returned. - mentions (Iterable[Span]): Mentions for which to get candidates. + mentions (SpanGroup): Mentions for which to get candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. """ return [self.get_candidates(span) for span in mentions] diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 7fe0b4741..b5122b164 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -8,7 +8,7 @@ from ...util import registry from ...kb import KnowledgeBase, InMemoryLookupKB from ...kb import Candidate from ...vocab import Vocab -from ...tokens import Span, Doc +from ...tokens import Doc, Span, SpanGroup from ..extract_spans import extract_spans from ...errors import Errors @@ -114,7 +114,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: @registry.misc("spacy.CandidateBatchGenerator.v1") def create_candidates_batch() -> Callable[ - [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] + [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]] ]: return get_candidates_batch @@ -130,12 +130,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: def get_candidates_batch( - kb: KnowledgeBase, mentions: Iterable[Span] + kb: KnowledgeBase, mentions: SpanGroup ) -> Iterable[Iterable[Candidate]]: """ Return candidate entities for the given mentions and fetching appropriate entries from the index. kb (KnowledgeBase): Knowledge base to query. 
-    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
+    mentions (SpanGroup): Entity mentions for which to identify candidates.
     RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
     """
     return kb.get_candidates_batch(mentions)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index caced9cfd..ecd156db5 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -11,6 +11,8 @@ from thinc.api import set_dropout_rate

 from ..kb import KnowledgeBase, Candidate
 from ..tokens import Doc, Span
+from ..ml import empty_kb
+from ..tokens import Doc, Span, SpanGroup
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -82,7 +84,7 @@ def make_entity_linker(
     entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     get_candidates_batch: Callable[
-        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+        [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
     ],
     generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
@@ -105,7 +107,7 @@
    get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
        produces a list of candidates, given a certain knowledge base and a textual mention.
    get_candidates_batch (
-        Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
+        Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]]
    ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
    generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
    scorer (Optional[Callable]): The scoring method.
@@ -170,7 +172,7 @@ class EntityLinker(TrainablePipe):
         entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
         get_candidates_batch: Callable[
-            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+            [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
         ],
         generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
         overwrite: bool = False,
@@ -194,7 +196,7 @@
        get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
            produces a list of candidates, given a certain knowledge base and a textual mention.
        get_candidates_batch (
-            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
+            Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
            Iterable[Candidate]]
        ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
@@ -473,7 +475,8 @@ class EntityLinker(TrainablePipe):

                 batch_candidates = list(
                     self.get_candidates_batch(
-                        self.kb, [ent_batch[idx] for idx in valid_ent_idx]
+                        self.kb,
+                        SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
                     )
                     if self.candidates_batch_size > 1
                     else [
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 65406a36e..773a5b8f3 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -997,7 +997,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-    # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()
diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx
index 6fa6cb235..3b33f7fb7 100644
--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@@ -189,14 +189,15 @@ to you.
 >
 > ```python
 > from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
 > nlp = English()
 > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
 > ```

 | Name        | Description |
 | ----------- | ------------------------------------------------------------------------------------------------------------ |
-| `mentions`  | The textual mentions. ~~Iterable[Span]~~ |
+| `mentions`  | The textual mentions. ~~SpanGroup~~ |
 | **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |

 ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx
index 9536a3fe3..94506162f 100644
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@@ -93,14 +93,15 @@ to you.
 >
 > ```python
 > from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
 > nlp = English()
 > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]]))
 > ```

 | Name        | Description |
 | ----------- | -------------------------------------------------------------------------------------------- |
-| `mentions`  | The textual mention or alias. ~~Iterable[Span]~~ |
+| `mentions`  | The textual mentions. ~~SpanGroup~~ |
 | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |

 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
@@ -187,13 +188,11 @@ Construct an `InMemoryCandidate` object. Usually this constructor is not called
 directly, but instead these objects are returned by the `get_candidates` method
 of the [`entity_linker`](/api/entitylinker) pipe.
-> #### Example```python
+> #### Example
 >
+> ```python
-> from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
-> entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
+> from spacy.kb import InMemoryCandidate
+> candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
->
-> ```
->
 > ```

 | Name | Description |

From a653dec6541c1a18ef820dc11ff7fb2a287c8665 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 27 Mar 2023 09:18:23 +0200
Subject: [PATCH 31/32] Add info that Vocab and StringStore are not static in
 docs (#12427)

* Add size increase info about vocab and stringstore

* Update website/docs/api/stringstore.mdx

Co-authored-by: Raphael Mitsch

* Update website/docs/api/vocab.mdx

Co-authored-by: Raphael Mitsch

* Change wording

---------

Co-authored-by: Raphael Mitsch
---
 website/docs/api/stringstore.mdx | 7 +++++++
 website/docs/api/vocab.mdx       | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index 7e380f5f8..2425c8adc 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
 integer IDs. This ensures that strings always map to the same ID, even from
 different `StringStores`.

+<Infobox variant="warning">
+
+Note that a `StringStore` instance is not static. It increases in size as texts
+with new tokens are processed.
+
+</Infobox>
+
 ## StringStore.\_\_init\_\_ {id="init",tag="method"}

 Create the `StringStore`.
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 304040f9c..1e32eb118 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
 [`StringStore`](/api/stringstore). It also owns underlying C-data that is
 shared between `Doc` objects.

+<Infobox variant="warning">
+
+Note that a `Vocab` instance is not static. It increases in size as texts with
+new tokens are processed.
+
+</Infobox>
+
 ## Vocab.\_\_init\_\_ {id="init",tag="method"}

 Create the vocabulary.

From b734e5314d3b8faa7d463c265db9a823a113165c Mon Sep 17 00:00:00 2001
From: Daniël de Kok
Date: Thu, 30 Mar 2023 09:30:42 +0200
Subject: [PATCH 32/32] Avoid `TrainablePipe.finish_update` getting called
 twice during training (#12450)

* Avoid `TrainablePipe.finish_update` getting called twice during training

PR #12136 fixed an issue where the tok2vec pipe was updated before
gradients were accumulated. However, it introduced a new bug that caused
`finish_update` to be called twice when using the training loop. This
causes a fairly large slowdown.

The `Language.update` method accepts the `sgd` argument for passing an
optimizer. This argument has three possible values:

- `Optimizer`: use the given optimizer to finish pipe updates.
- `None`: use a default optimizer to finish pipe updates.
- `False`: do not finish pipe updates.

However, the latter option was not documented and not valid with the
existing type of `sgd`. I assumed that this was a remnant of earlier
spaCy versions and removed handling of `False`.

However, with that change, we are passing `None` to `Language.update`.
As a result, we were calling `finish_update` in both `Language.update`
and in the training loop after all subbatches are processed. This
change restores proper handling/use of `False`. Moreover, the role of
`False` is now documented and added to the type to avoid future
accidents.
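For illustration, a minimal sketch of the three call patterns (the tiny tagger
setup below is assumed purely so that `nlp.update` has something to train; it is
not part of the patch itself):

```python
from spacy.lang.en import English
from spacy.training import Example
from thinc.api import Adam

nlp = English()
nlp.add_pipe("tagger")
doc = nlp.make_doc("I like green eggs")
examples = [Example.from_dict(doc, {"tags": ["N", "V", "J", "N"]})]
nlp.initialize(get_examples=lambda: examples)

# Optimizer given: gradients are applied with this optimizer.
optimizer = Adam(0.001)
nlp.update(examples, sgd=optimizer)

# None (the default): a default optimizer is created internally and
# used to finish the pipe updates.
nlp.update(examples, sgd=None)

# False: gradients are only accumulated; finish_update is not called,
# so the caller must apply the accumulated gradients later.
nlp.update(examples, sgd=False)
```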
* Fix typo

* Document defaults for `Language.update`
---
 spacy/language.py             |  7 +++++--
 spacy/tests/test_language.py  | 18 ++++++++++++++++++
 spacy/training/loop.py        |  2 +-
 website/docs/api/language.mdx | 18 +++++++++---------
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 3b86fdde7..ce3630629 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1202,7 +1202,7 @@ class Language:
         _: Optional[Any] = None,
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1213,7 +1213,9 @@ class Language:
         examples (Iterable[Example]): A batch of examples
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (Optimizer): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Dict[str, float]): Dictionary to update with the loss, keyed by
             component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
@@ -1272,6 +1274,7 @@ class Language:
                 name not in exclude
                 and isinstance(proc, ty.TrainableComponent)
                 and proc.is_trainable
+                and sgd not in (None, False)
             ):
                 proc.finish_update(sgd)

diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 9b8c7b9c7..08a7d28a4 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -157,6 +157,24 @@ def test_language_update_updates():
     )


+def test_language_update_does_not_update_with_sgd_false():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=False)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index c737d7c01..587a2516c 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -210,7 +210,7 @@ def train_while_improving(
                     subbatch,
                     drop=dropout,
                     losses=losses,
-                    sgd=None,
+                    sgd=False,
                     exclude=exclude,
                     annotates=annotating_components,
                 )
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index c25bfcee5..5cd9e4af8 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -323,15 +323,15 @@ and custom registered functions if needed. See the
 > nlp.update([example], sgd=optimizer)
 > ```

-| Name            | Description |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
-| _keyword-only_  | |
-| `drop`          | The dropout rate. ~~float~~ |
-| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
-| `losses`        | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+| Name            | Description |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
+| _keyword-only_  | |
+| `drop`          | The dropout rate. Defaults to `0.0`. ~~float~~ |
+| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`        | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~ |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
+| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~ |

 ## Language.distill {id="distill",tag="method,experimental",version="4"}
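To close out the `sgd=False` change, a simplified sketch of the
accumulate-then-finish pattern the training loop relies on. This assumes an
initialized `nlp` pipeline, a thinc `optimizer`, and a list of `subbatches` of
`Example` objects; the actual implementation in `spacy/training/loop.py`
performs additional bookkeeping:

```python
losses = {}
for subbatch in subbatches:
    # sgd=False: accumulate gradients across subbatches without
    # calling finish_update on any pipe.
    nlp.update(subbatch, losses=losses, sgd=False)

# Apply the accumulated gradients exactly once per full batch.
for name, proc in nlp.pipeline:
    if getattr(proc, "is_trainable", False):
        proc.finish_update(optimizer)
```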