mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	Merge remote-tracking branch 'upstream/v4' into feature/lookups-tables-url
This commit is contained in:
		
						commit
						41037041f0
					
				
							
								
								
									
										5
									
								
								.github/azure-steps.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										5
									
								
								.github/azure-steps.yml
									
									
									
									
										vendored
									
									
								
							|  | @ -69,6 +69,11 @@ steps: | |||
| #    displayName: 'Test skip re-download (#12188)' | ||||
| #    condition: eq(variables['python_version'], '3.8') | ||||
| 
 | ||||
| #  - script: | | ||||
| #      python -W error -m spacy info ca_core_news_sm | grep -q download_url | ||||
| #    displayName: 'Test download_url in info CLI' | ||||
| #    condition: eq(variables['python_version'], '3.8') | ||||
| 
 | ||||
|   - script: | | ||||
|       python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . | ||||
|     displayName: 'Test convert CLI' | ||||
|  |  | |||
							
								
								
									
										2
									
								
								.github/workflows/autoblack.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/autoblack.yml
									
									
									
									
										vendored
									
									
								
							|  | @ -16,7 +16,7 @@ jobs: | |||
|         with: | ||||
|             ref: ${{ github.head_ref }} | ||||
|       - uses: actions/setup-python@v4 | ||||
|       - run: pip install black | ||||
|       - run: pip install black -c requirements.txt | ||||
|       - name: Auto-format code if needed | ||||
|         run: black spacy | ||||
|       # We can't run black --check here because that returns a non-zero exit | ||||
|  |  | |||
|  | @ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its | |||
| Python modules. If you've built spaCy from source, you'll already have both | ||||
| tools installed. | ||||
| 
 | ||||
| As a general rule of thumb, we use f-strings for any formatting of strings. | ||||
| The one exception is calls to Python's `logging` functionality. | ||||
| To avoid unnecessary string conversions in these cases, we use string formatting | ||||
| templates with `%s` and `%d` etc. | ||||
| 
 | ||||
| **⚠️ Note that formatting and linting is currently only possible for Python | ||||
| modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** | ||||
| 
 | ||||
|  |  | |||
|  | @ -41,7 +41,7 @@ jobs: | |||
|         inputs: | ||||
|           versionSpec: "3.8" | ||||
|       - script: | | ||||
|           pip install black==22.3.0 | ||||
|           pip install black -c requirements.txt | ||||
|           python -m black spacy --check | ||||
|         displayName: "black" | ||||
|       - script: | | ||||
|  |  | |||
|  | @ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0 | |||
| mock>=2.0.0,<3.0.0 | ||||
| flake8>=3.8.0,<6.0.0 | ||||
| hypothesis>=3.27.0,<7.0.0 | ||||
| mypy>=0.990,<0.1000; platform_machine != "aarch64" | ||||
| mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7" | ||||
| types-dataclasses>=0.1.3; python_version < "3.7" | ||||
| types-mock>=0.1.1 | ||||
| types-setuptools>=57.0.0 | ||||
| types-requests | ||||
| types-setuptools>=57.0.0 | ||||
| black>=22.0,<23.0 | ||||
| black==22.3.0 | ||||
|  |  | |||
|  | @ -90,9 +90,9 @@ def parse_config_overrides( | |||
|     cli_overrides = _parse_overrides(args, is_cli=True) | ||||
|     if cli_overrides: | ||||
|         keys = [k for k in cli_overrides if k not in env_overrides] | ||||
|         logger.debug(f"Config overrides from CLI: {keys}") | ||||
|         logger.debug("Config overrides from CLI: %s", keys) | ||||
|     if env_overrides: | ||||
|         logger.debug(f"Config overrides from env variables: {list(env_overrides)}") | ||||
|         logger.debug("Config overrides from env variables: %s", list(env_overrides)) | ||||
|     return {**cli_overrides, **env_overrides} | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,10 +1,10 @@ | |||
| from typing import Optional, Dict, Any, Union, List | ||||
| import platform | ||||
| import pkg_resources | ||||
| import json | ||||
| from pathlib import Path | ||||
| from wasabi import Printer, MarkdownRenderer | ||||
| import srsly | ||||
| import importlib.metadata | ||||
| 
 | ||||
| from ._util import app, Arg, Opt, string_to_list | ||||
| from .download import get_model_filename, get_latest_version | ||||
|  | @ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]: | |||
|     dist-info available. | ||||
|     """ | ||||
|     try: | ||||
|         dist = pkg_resources.get_distribution(model) | ||||
|         data = json.loads(dist.get_metadata("direct_url.json")) | ||||
|         return data["url"] | ||||
|     except pkg_resources.DistributionNotFound: | ||||
|         # no such package | ||||
|         return None | ||||
|         dist = importlib.metadata.distribution(model) | ||||
|         text = dist.read_text("direct_url.json") | ||||
|         if isinstance(text, str): | ||||
|             data = json.loads(text) | ||||
|             return data["url"] | ||||
|     except Exception: | ||||
|         # something else, like no file or invalid JSON | ||||
|         return None | ||||
|         pass | ||||
|     return None | ||||
| 
 | ||||
| 
 | ||||
| def info_model_url(model: str) -> Dict[str, Any]: | ||||
|  |  | |||
|  | @ -252,7 +252,7 @@ def get_third_party_dependencies( | |||
|                     raise regerr from None | ||||
|             module_name = func_info.get("module")  # type: ignore[attr-defined] | ||||
|             if module_name:  # the code is part of a module, not a --code file | ||||
|                 modules.add(func_info["module"].split(".")[0])  # type: ignore[index] | ||||
|                 modules.add(func_info["module"].split(".")[0])  # type: ignore[union-attr] | ||||
|     dependencies = [] | ||||
|     for module_name in modules: | ||||
|         if module_name in distributions: | ||||
|  |  | |||
|  | @ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): | |||
|     # in the list. | ||||
|     while commands: | ||||
|         for i, cmd in enumerate(list(commands)): | ||||
|             logger.debug(f"CMD: {cmd['name']}.") | ||||
|             logger.debug("CMD: %s.", cmd["name"]) | ||||
|             deps = [project_dir / dep for dep in cmd.get("deps", [])] | ||||
|             if all(dep.exists() for dep in deps): | ||||
|                 cmd_hash = get_command_hash("", "", deps, cmd["script"]) | ||||
|                 for output_path in cmd.get("outputs", []): | ||||
|                     url = storage.pull(output_path, command_hash=cmd_hash) | ||||
|                     logger.debug( | ||||
|                         f"URL: {url} for {output_path} with command hash {cmd_hash}" | ||||
|                         "URL: %s for %s with command hash %s", | ||||
|                         url, | ||||
|                         output_path, | ||||
|                         cmd_hash, | ||||
|                     ) | ||||
|                     yield url, output_path | ||||
| 
 | ||||
|  | @ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): | |||
|                 commands.pop(i) | ||||
|                 break | ||||
|             else: | ||||
|                 logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.") | ||||
|                 logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"]) | ||||
|         else: | ||||
|             # If we didn't break the for loop, break the while loop. | ||||
|             break | ||||
|  |  | |||
|  | @ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str): | |||
|         remote = config["remotes"][remote] | ||||
|     storage = RemoteStorage(project_dir, remote) | ||||
|     for cmd in config.get("commands", []): | ||||
|         logger.debug(f"CMD: cmd['name']") | ||||
|         logger.debug("CMD: %s", cmd["name"]) | ||||
|         deps = [project_dir / dep for dep in cmd.get("deps", [])] | ||||
|         if any(not dep.exists() for dep in deps): | ||||
|             logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs") | ||||
|             logger.debug("Dependency missing. Skipping %s outputs", cmd["name"]) | ||||
|             continue | ||||
|         cmd_hash = get_command_hash( | ||||
|             "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] | ||||
|         ) | ||||
|         logger.debug(f"CMD_HASH: {cmd_hash}") | ||||
|         logger.debug("CMD_HASH: %s", cmd_hash) | ||||
|         for output_path in cmd.get("outputs", []): | ||||
|             output_loc = project_dir / output_path | ||||
|             if output_loc.exists() and _is_not_empty_dir(output_loc): | ||||
|  | @ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str): | |||
|                     content_hash=get_content_hash(output_loc), | ||||
|                 ) | ||||
|                 logger.debug( | ||||
|                     f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}" | ||||
|                     "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash | ||||
|                 ) | ||||
|                 yield output_path, url | ||||
| 
 | ||||
|  |  | |||
|  | @ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple | |||
| import os.path | ||||
| from pathlib import Path | ||||
| 
 | ||||
| import pkg_resources | ||||
| from wasabi import msg | ||||
| from wasabi.util import locale_escape | ||||
| import sys | ||||
|  | @ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: | |||
|     RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts | ||||
|         exist. | ||||
|     """ | ||||
|     import pkg_resources | ||||
| 
 | ||||
|     failed_pkgs_msgs: List[str] = [] | ||||
|     conflicting_pkgs_msgs: List[str] = [] | ||||
|  |  | |||
|  | @ -84,7 +84,7 @@ class Warnings(metaclass=ErrorsWithCodes): | |||
|             "ignoring the duplicate entry.") | ||||
|     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be " | ||||
|             "incorrect. Modify PhraseMatcher._terminal_hash to fix.") | ||||
|     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " | ||||
|     W024 = ("Entity '{entity}' - alias '{alias}' combination already exists in " | ||||
|             "the Knowledge Base.") | ||||
|     W026 = ("Unable to set all sentence boundaries from dependency parses. If " | ||||
|             "you are constructing a parse tree incrementally by setting " | ||||
|  | @ -212,7 +212,11 @@ class Warnings(metaclass=ErrorsWithCodes): | |||
|             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") | ||||
|     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") | ||||
| 
 | ||||
|     # v4 warning strings | ||||
|     W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") | ||||
|     W401 = ("`incl_prior is True`, but the selected knowledge base type {kb_type} doesn't support prior probability " | ||||
|             "lookups so this setting will be ignored. If your KB does support prior probability lookups, make sure " | ||||
|             "to return `True` in `.supports_prior_probs`.") | ||||
| 
 | ||||
| 
 | ||||
| class Errors(metaclass=ErrorsWithCodes): | ||||
|  | @ -440,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|     E133 = ("The sum of prior probabilities for alias '{alias}' should not " | ||||
|             "exceed 1, but found {sum}.") | ||||
|     E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") | ||||
|     E139 = ("Knowledge base for component '{name}' is empty. Use the methods " | ||||
|             "`kb.add_entity` and `kb.add_alias` to add entries.") | ||||
|     E139 = ("Knowledge base for component '{name}' is empty.") | ||||
|     E140 = ("The list of entities, prior probabilities and entity vectors " | ||||
|             "should be of equal length.") | ||||
|     E141 = ("Entity vectors should be of length {required} instead of the " | ||||
|  | @ -954,7 +957,7 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|     E1049 = ("No available port found for displaCy on host {host}. Please specify an available port " | ||||
|              "with `displacy.serve(doc, port=port)`") | ||||
|     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " | ||||
|              "or use `auto_switch_port=True` to pick an available port automatically.") | ||||
|              "or use `auto_select_port=True` to pick an available port automatically.") | ||||
| 
 | ||||
|     # v4 error strings | ||||
|     E4000 = ("Expected a Doc as input, but got: '{type}'") | ||||
|  | @ -964,7 +967,9 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|     E4003 = ("Training examples for distillation must have the exact same tokens in the " | ||||
|              "reference and predicted docs.") | ||||
|     E4004 = ("Backprop is not supported when is_train is not set.") | ||||
|     E4005 = ("Required lemmatizer table(s) {missing_tables} not found in " | ||||
|     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.") | ||||
|     E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.") | ||||
|     E4007 = ("Required lemmatizer table(s) {missing_tables} not found in " | ||||
|              "[initialize] or in registered lookups (spacy-lookups-data). An " | ||||
|              "example for how to load lemmatizer tables in [initialize]:\n\n" | ||||
|              "[initialize.components]\n\n" | ||||
|  | @ -975,7 +980,8 @@ class Errors(metaclass=ErrorsWithCodes): | |||
|              f'url = "{about.__lookups_url__}"\n' | ||||
|              "tables = {tables}\n" | ||||
|              "# or required tables only: tables = {required_tables}\n") | ||||
|     E4006 = ("Server error ({status_code}), couldn't fetch {url}") | ||||
|     E4008 = ("Server error ({status_code}), couldn't fetch {url}") | ||||
| 
 | ||||
| 
 | ||||
| RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,3 +1,5 @@ | |||
| from .kb import KnowledgeBase | ||||
| from .kb_in_memory import InMemoryLookupKB | ||||
| from .candidate import Candidate, get_candidates, get_candidates_batch | ||||
| from .candidate import Candidate, InMemoryCandidate | ||||
| 
 | ||||
| __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] | ||||
|  |  | |||
|  | @ -1,12 +1,15 @@ | |||
| from .kb cimport KnowledgeBase | ||||
| from libcpp.vector cimport vector | ||||
| from .kb_in_memory cimport InMemoryLookupKB | ||||
| from ..typedefs cimport hash_t | ||||
| 
 | ||||
| # Object used by the Entity Linker that summarizes one entity-alias candidate combination. | ||||
| cdef class Candidate: | ||||
|     cdef readonly KnowledgeBase kb | ||||
|     cdef hash_t entity_hash | ||||
|     cdef float entity_freq | ||||
|     cdef vector[float] entity_vector | ||||
|     cdef hash_t alias_hash | ||||
|     cdef float prior_prob | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| cdef class InMemoryCandidate(Candidate): | ||||
|     cdef readonly hash_t _entity_hash | ||||
|     cdef readonly hash_t _alias_hash | ||||
|     cpdef vector[float] _entity_vector | ||||
|     cdef float _prior_prob | ||||
|     cdef readonly InMemoryLookupKB _kb | ||||
|     cdef float _entity_freq | ||||
|  |  | |||
|  | @ -1,74 +1,96 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| 
 | ||||
| from typing import Iterable | ||||
| from .kb cimport KnowledgeBase | ||||
| from ..tokens import Span | ||||
| from .kb_in_memory cimport InMemoryLookupKB | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| cdef class Candidate: | ||||
|     """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved | ||||
|     to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking | ||||
|     """A `Candidate` object refers to a textual mention that may or may not be resolved | ||||
|     to a specific entity from a Knowledge Base. This will be used as input for the entity linking | ||||
|     algorithm which will disambiguate the various candidates to the correct one. | ||||
|     Each candidate (alias, entity) pair is assigned a certain prior probability. | ||||
|     Each candidate, which represents a possible link between one textual mention and one entity in the knowledge base, | ||||
|     is assigned a certain prior probability. | ||||
| 
 | ||||
|     DOCS: https://spacy.io/api/kb/#candidate-init | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): | ||||
|         self.kb = kb | ||||
|         self.entity_hash = entity_hash | ||||
|         self.entity_freq = entity_freq | ||||
|         self.entity_vector = entity_vector | ||||
|         self.alias_hash = alias_hash | ||||
|         self.prior_prob = prior_prob | ||||
|     def __init__(self): | ||||
|         # Make sure abstract Candidate is not instantiated. | ||||
|         if self.__class__ == Candidate: | ||||
|             raise TypeError( | ||||
|                 Errors.E1046.format(cls_name=self.__class__.__name__) | ||||
|             ) | ||||
| 
 | ||||
|     @property | ||||
|     def entity(self) -> int: | ||||
|         """RETURNS (uint64): hash of the entity's KB ID/name""" | ||||
|         return self.entity_hash | ||||
|     def entity_id(self) -> int: | ||||
|         """RETURNS (int): Numerical representation of entity ID (if entity ID is numerical, this is just the entity ID, | ||||
|         otherwise the hash of the entity ID string).""" | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     @property | ||||
|     def entity_(self) -> str: | ||||
|         """RETURNS (str): ID/name of this entity in the KB""" | ||||
|         return self.kb.vocab.strings[self.entity_hash] | ||||
|     def entity_id_(self) -> str: | ||||
|         """RETURNS (str): String representation of entity ID.""" | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     @property | ||||
|     def alias(self) -> int: | ||||
|         """RETURNS (uint64): hash of the alias""" | ||||
|         return self.alias_hash | ||||
|     def entity_vector(self) -> vector[float]: | ||||
|         """RETURNS (vector[float]): Entity vector.""" | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
| 
 | ||||
| cdef class InMemoryCandidate(Candidate): | ||||
|     """Candidate for InMemoryLookupKB.""" | ||||
| 
 | ||||
|     def __init__( | ||||
|         self, | ||||
|         kb: InMemoryLookupKB, | ||||
|         entity_hash: int, | ||||
|         alias_hash: int, | ||||
|         entity_vector: vector[float], | ||||
|         prior_prob: float, | ||||
|         entity_freq: float | ||||
|     ): | ||||
|         """ | ||||
|         kb (InMemoryLookupKB): InMemoryLookupKB instance. | ||||
|         entity_hash (int): Entity ID as hash that can be looked up with InMemoryLookupKB.vocab.strings.__getitem__(). | ||||
|         entity_freq (float): Entity frequency in KB corpus. | ||||
|         entity_vector (List[float]): Entity embedding. | ||||
|         alias_hash (int): Alias hash. | ||||
|         prior_prob (float): Prior probability of entity for this alias, i.e. the probability that, independent of | ||||
|             the context, this alias - which matches one of this entity's aliases - resolves to this entity. | ||||
|         """ | ||||
|         super().__init__() | ||||
| 
 | ||||
|         self._entity_hash = entity_hash | ||||
|         self._entity_vector = entity_vector | ||||
|         self._prior_prob = prior_prob | ||||
|         self._kb = kb | ||||
|         self._alias_hash = alias_hash | ||||
|         self._entity_freq = entity_freq | ||||
| 
 | ||||
|     @property | ||||
|     def alias_(self) -> str: | ||||
|         """RETURNS (str): ID of the original alias""" | ||||
|         return self.kb.vocab.strings[self.alias_hash] | ||||
|     def entity_id(self) -> int: | ||||
|         return self._entity_hash | ||||
| 
 | ||||
|     @property | ||||
|     def entity_freq(self) -> float: | ||||
|         return self.entity_freq | ||||
| 
 | ||||
|     @property | ||||
|     def entity_vector(self) -> Iterable[float]: | ||||
|         return self.entity_vector | ||||
|     def entity_vector(self) -> vector[float]: | ||||
|         return self._entity_vector | ||||
| 
 | ||||
|     @property | ||||
|     def prior_prob(self) -> float: | ||||
|         return self.prior_prob | ||||
|         """RETURNS (float): Prior probability that this alias, which matches one of this entity's synonyms, resolves to | ||||
|         this entity.""" | ||||
|         return self._prior_prob | ||||
| 
 | ||||
|     @property | ||||
|     def alias(self) -> str: | ||||
|         """RETURNS (str): Alias.""" | ||||
|         return self._kb.vocab.strings[self._alias_hash] | ||||
| 
 | ||||
| def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: | ||||
|     """ | ||||
|     Return candidate entities for a given mention and fetching appropriate entries from the index. | ||||
|     kb (KnowledgeBase): Knowledge base to query. | ||||
|     mention (Span): Entity mention for which to identify candidates. | ||||
|     RETURNS (Iterable[Candidate]): Identified candidates. | ||||
|     """ | ||||
|     return kb.get_candidates(mention) | ||||
|     @property | ||||
|     def entity_id_(self) -> str: | ||||
|         return self._kb.vocab.strings[self._entity_hash] | ||||
| 
 | ||||
| 
 | ||||
| def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: | ||||
|     """ | ||||
|     Return candidate entities for the given mentions and fetching appropriate entries from the index. | ||||
|     kb (KnowledgeBase): Knowledge base to query. | ||||
|     mention (Iterable[Span]): Entity mentions for which to identify candidates. | ||||
|     RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. | ||||
|     """ | ||||
|     return kb.get_candidates_batch(mentions) | ||||
|     @property | ||||
|     def entity_freq(self) -> float: | ||||
|         """RETURNS (float): Entity frequency in KB corpus.""" | ||||
|         return self._entity_freq | ||||
|  |  | |||
|  | @ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union | |||
| from cymem.cymem cimport Pool | ||||
| 
 | ||||
| from .candidate import Candidate | ||||
| from ..tokens import Span | ||||
| from ..tokens import Span, SpanGroup | ||||
| from ..util import SimpleFrozenList | ||||
| from ..errors import Errors | ||||
| 
 | ||||
|  | @ -30,21 +30,23 @@ cdef class KnowledgeBase: | |||
|         self.entity_vector_length = entity_vector_length | ||||
|         self.mem = Pool() | ||||
| 
 | ||||
|     def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: | ||||
|     def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]: | ||||
|         """ | ||||
|         Return candidate entities for specified texts. Each candidate defines the entity, the original alias, | ||||
|         and the prior probability of that alias resolving to that entity. | ||||
|         If no candidate is found for a given text, an empty list is returned. | ||||
|         mentions (Iterable[Span]): Mentions for which to get candidates. | ||||
|         Return candidate entities for the specified Span mentions. Each candidate defines at least the entity and the | ||||
|         entity's embedding vector. Depending on the KB implementation, further properties - such as the prior | ||||
|         probability of the specified mention text resolving to that entity - might be included. | ||||
|         If no candidates are found for a given mention, an empty list is returned. | ||||
|         mentions (SpanGroup): Mentions for which to get candidates. | ||||
|         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. | ||||
|         """ | ||||
|         return [self.get_candidates(span) for span in mentions] | ||||
| 
 | ||||
|     def get_candidates(self, mention: Span) -> Iterable[Candidate]: | ||||
|         """ | ||||
|         Return candidate entities for specified text. Each candidate defines the entity, the original alias, | ||||
|         and the prior probability of that alias resolving to that entity. | ||||
|         If the no candidate is found for a given text, an empty list is returned. | ||||
|         Return candidate entities for a specific mention. Each candidate defines at least the entity and the | ||||
|         entity's embedding vector. Depending on the KB implementation, further properties - such as the prior | ||||
|         probability of the specified mention text resolving to that entity - might be included. | ||||
|         If no candidate is found for the given mention, an empty list is returned. | ||||
|         mention (Span): Mention for which to get candidates. | ||||
|         RETURNS (Iterable[Candidate]): Identified candidates. | ||||
|         """ | ||||
|  | @ -106,3 +108,10 @@ cdef class KnowledgeBase: | |||
|         raise NotImplementedError( | ||||
|             Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) | ||||
|         ) | ||||
| 
 | ||||
|     @property | ||||
|     def supports_prior_probs(self) -> bool: | ||||
|         """RETURNS (bool): Whether this KB type supports looking up prior probabilities for entity mentions.""" | ||||
|         raise NotImplementedError( | ||||
|             Errors.E1045.format(parent="KnowledgeBase", method="supports_prior_probs", name=self.__name__) | ||||
|         ) | ||||
|  |  | |||
|  | @ -18,7 +18,7 @@ from .. import util | |||
| from ..util import SimpleFrozenList, ensure_path | ||||
| from ..vocab cimport Vocab | ||||
| from .kb cimport KnowledgeBase | ||||
| from .candidate import Candidate as Candidate | ||||
| from .candidate import InMemoryCandidate | ||||
| 
 | ||||
| 
 | ||||
| cdef class InMemoryLookupKB(KnowledgeBase): | ||||
|  | @ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|         self._alias_index = PreshMap(nr_aliases + 1) | ||||
|         self._aliases_table = alias_vec(nr_aliases + 1) | ||||
| 
 | ||||
|     def is_empty(self): | ||||
|         return len(self) == 0 | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         return self.get_size_entities() | ||||
| 
 | ||||
|  | @ -223,10 +226,10 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|             alias_entry.probs = probs | ||||
|             self._aliases_table[alias_index] = alias_entry | ||||
| 
 | ||||
|     def get_candidates(self, mention: Span) -> Iterable[Candidate]: | ||||
|         return self.get_alias_candidates(mention.text)  # type: ignore | ||||
|     def get_candidates(self, mention: Span) -> Iterable[InMemoryCandidate]: | ||||
|         return self._get_alias_candidates(mention.text)  # type: ignore | ||||
| 
 | ||||
|     def get_alias_candidates(self, str alias) -> Iterable[Candidate]: | ||||
|     def _get_alias_candidates(self, str alias) -> Iterable[InMemoryCandidate]: | ||||
|         """ | ||||
|         Return candidate entities for an alias. Each candidate defines the entity, the original alias, | ||||
|         and the prior probability of that alias resolving to that entity. | ||||
|  | @ -238,14 +241,18 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
|         alias_index = <int64_t>self._alias_index.get(alias_hash) | ||||
|         alias_entry = self._aliases_table[alias_index] | ||||
| 
 | ||||
|         return [Candidate(kb=self, | ||||
|                           entity_hash=self._entries[entry_index].entity_hash, | ||||
|                           entity_freq=self._entries[entry_index].freq, | ||||
|                           entity_vector=self._vectors_table[self._entries[entry_index].vector_index], | ||||
|                           alias_hash=alias_hash, | ||||
|                           prior_prob=prior_prob) | ||||
|                 for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) | ||||
|                 if entry_index != 0] | ||||
|         return [ | ||||
|             InMemoryCandidate( | ||||
|                 kb=self, | ||||
|                 entity_hash=self._entries[entry_index].entity_hash, | ||||
|                 alias_hash=alias_hash, | ||||
|                 entity_vector=self._vectors_table[self._entries[entry_index].vector_index], | ||||
|                 prior_prob=prior_prob, | ||||
|                 entity_freq=self._entries[entry_index].freq | ||||
|             ) | ||||
|             for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) | ||||
|             if entry_index != 0 | ||||
|         ] | ||||
| 
 | ||||
|     def get_vector(self, str entity): | ||||
|         cdef hash_t entity_hash = self.vocab.strings[entity] | ||||
|  | @ -276,6 +283,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): | |||
| 
 | ||||
|         return 0.0 | ||||
| 
 | ||||
|     def supports_prior_probs(self) -> bool: | ||||
|         return True | ||||
| 
 | ||||
|     def to_bytes(self, **kwargs): | ||||
|         """Serialize the current state to a binary string. | ||||
|         """ | ||||
|  |  | |||
|  | @ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS | |||
| from .syntax_iterators import SYNTAX_ITERATORS | ||||
| from ...language import Language, BaseDefaults | ||||
| from ...pipeline import Lemmatizer | ||||
| 
 | ||||
| 
 | ||||
| # Punctuation stolen from Danish | ||||
| from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES | ||||
| from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES | ||||
| 
 | ||||
| 
 | ||||
| class SwedishDefaults(BaseDefaults): | ||||
|  |  | |||
							
								
								
									
										33
									
								
								spacy/lang/sv/punctuation.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										33
									
								
								spacy/lang/sv/punctuation.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,33 @@ | |||
| from ..char_classes import LIST_ELLIPSES, LIST_ICONS | ||||
| from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER | ||||
| from ..punctuation import TOKENIZER_SUFFIXES | ||||
| 
 | ||||
| 
 | ||||
# Quote characters used in the infix pattern below, with the ASCII apostrophe
# removed so that an apostrophe between letters is not treated as an infix.
_quotes = CONCAT_QUOTES.replace("'", "")

# Infix patterns: the shared ellipsis and icon lists, followed by patterns
# that are each anchored with a lookbehind and a lookahead on the characters
# directly around the candidate infix. The order of the entries is kept as is.
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # "." preceded by an ALPHA_LOWER character and followed by an ALPHA_UPPER one
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        # ",", "!" or "?" between two ALPHA characters
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        # "<", ">" or "=" between two ALPHA characters
        r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
        # ":" between two ALPHA_UPPER characters (note: a is ALPHA_UPPER here)
        r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
        # "," between two ALPHA characters (also matched by the [,!?] pattern above)
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # a quote character from _quotes, or one of ")", "]", "(", "[",
        # between two ALPHA characters
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
        # "--" between two ALPHA characters
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
        # "<", ">", "=" or "/" preceded by an ALPHA character or digit and
        # followed by an ALPHA character
        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
        # ":" preceded by an ALPHA_UPPER character or digit and followed by
        # an ALPHA_UPPER character
        r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
    ]
)

# Start from the shared suffix patterns, dropping the "'s"-style entries and
# the bare apostrophe entry listed below.
_suffixes = [
    suffix
    for suffix in TOKENIZER_SUFFIXES
    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
]
# Re-add the apostrophe as a suffix only when the preceding character is not
# one of s, x or z (either case).
_suffixes += [r"(?<=[^sSxXzZ])\'"]


# Public names; TOKENIZER_SUFFIXES rebinds the name imported at the top of the module.
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
|  | @ -2065,7 +2065,7 @@ class Language: | |||
|         pipe = self.get_pipe(pipe_name) | ||||
|         pipe_cfg = self._pipe_configs[pipe_name] | ||||
|         if listeners: | ||||
|             util.logger.debug(f"Replacing listeners of component '{pipe_name}'") | ||||
|             util.logger.debug("Replacing listeners of component '%s'", pipe_name) | ||||
|             if len(list(listeners)) != len(pipe_listeners): | ||||
|                 # The number of listeners defined in the component model doesn't | ||||
|                 # match the listeners to replace, so we won't be able to update | ||||
|  |  | |||
|  | @ -30,7 +30,7 @@ def load_lookups_data_from_url(lang, tables, url): | |||
|         r = requests.get(table_url) | ||||
|         if r.status_code != 200: | ||||
|             raise ValueError( | ||||
|                 Errors.E4006.format(status_code=r.status_code, url=table_url) | ||||
|                 Errors.E4008.format(status_code=r.status_code, url=table_url) | ||||
|             ) | ||||
|         table_data = r.json() | ||||
|         lookups.add_table(table, table_data) | ||||
|  |  | |||
|  | @ -82,8 +82,12 @@ cdef class DependencyMatcher: | |||
|             "$-": self._imm_left_sib, | ||||
|             "$++": self._right_sib, | ||||
|             "$--": self._left_sib, | ||||
|             ">+": self._imm_right_child, | ||||
|             ">-": self._imm_left_child, | ||||
|             ">++": self._right_child, | ||||
|             ">--": self._left_child, | ||||
|             "<+": self._imm_right_parent, | ||||
|             "<-": self._imm_left_parent, | ||||
|             "<++": self._right_parent, | ||||
|             "<--": self._left_parent, | ||||
|         } | ||||
|  | @ -427,12 +431,34 @@ cdef class DependencyMatcher: | |||
|     def _left_sib(self, doc, node): | ||||
|         return [doc[child.i] for child in doc[node].head.children if child.i < node] | ||||
| 
 | ||||
|     def _imm_right_child(self, doc, node): | ||||
|         for child in doc[node].children: | ||||
|             if child.i == node + 1: | ||||
|                 return [doc[child.i]] | ||||
|         return [] | ||||
| 
 | ||||
|     def _imm_left_child(self, doc, node): | ||||
|         for child in doc[node].children: | ||||
|             if child.i == node - 1: | ||||
|                 return [doc[child.i]] | ||||
|         return [] | ||||
| 
 | ||||
|     def _right_child(self, doc, node): | ||||
|         return [doc[child.i] for child in doc[node].children if child.i > node] | ||||
|      | ||||
|     def _left_child(self, doc, node): | ||||
|         return [doc[child.i] for child in doc[node].children if child.i < node] | ||||
| 
 | ||||
|     def _imm_right_parent(self, doc, node): | ||||
|         if doc[node].head.i == node + 1: | ||||
|             return [doc[node].head] | ||||
|         return [] | ||||
| 
 | ||||
|     def _imm_left_parent(self, doc, node): | ||||
|         if doc[node].head.i == node - 1: | ||||
|             return [doc[node].head] | ||||
|         return [] | ||||
| 
 | ||||
|     def _right_parent(self, doc, node): | ||||
|         if doc[node].head.i > node: | ||||
|             return [doc[node].head] | ||||
|  |  | |||
|  | @ -829,6 +829,11 @@ def _get_attr_values(spec, string_store): | |||
|     return attr_values | ||||
| 
 | ||||
| 
 | ||||
def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
    """Build the tuple used as a lookup key for a predicate.

    `value` is serialised to JSON with sorted keys, so that equal values
    produce the same string regardless of key order.
    """
    serialized_value = srsly.json_dumps(value, sort_keys=True)
    # tuple order affects performance
    return (attr, regex, fuzzy, predicate, serialized_value)
| 
 | ||||
| 
 | ||||
| # These predicate helper classes are used to match the REGEX, IN, >= etc | ||||
| # extensions to the matcher introduced in #3173. | ||||
| 
 | ||||
|  | @ -848,7 +853,7 @@ class _FuzzyPredicate: | |||
|         fuzz = self.predicate[len("FUZZY"):] # number after prefix | ||||
|         self.fuzzy = int(fuzz) if fuzz else -1 | ||||
|         self.fuzzy_compare = fuzzy_compare | ||||
|         self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) | ||||
|         self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy) | ||||
| 
 | ||||
|     def __call__(self, Token token): | ||||
|         if self.is_extension: | ||||
|  | @ -870,7 +875,7 @@ class _RegexPredicate: | |||
|         self.value = re.compile(value) | ||||
|         self.predicate = predicate | ||||
|         self.is_extension = is_extension | ||||
|         self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) | ||||
|         self.key = _predicate_cache_key(self.attr, self.predicate, value) | ||||
|         if self.predicate not in self.operators: | ||||
|             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) | ||||
| 
 | ||||
|  | @ -906,7 +911,7 @@ class _SetPredicate: | |||
|                 self.value = set(get_string_id(v) for v in value) | ||||
|         self.predicate = predicate | ||||
|         self.is_extension = is_extension | ||||
|         self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) | ||||
|         self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy) | ||||
|         if self.predicate not in self.operators: | ||||
|             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) | ||||
| 
 | ||||
|  | @ -978,7 +983,7 @@ class _ComparisonPredicate: | |||
|         self.value = value | ||||
|         self.predicate = predicate | ||||
|         self.is_extension = is_extension | ||||
|         self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) | ||||
|         self.key = _predicate_cache_key(self.attr, self.predicate, value) | ||||
|         if self.predicate not in self.operators: | ||||
|             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) | ||||
| 
 | ||||
|  | @ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types, | |||
|         if isinstance(value, dict): | ||||
|             for type_, cls in predicate_types.items(): | ||||
|                 if type_ in value: | ||||
|                     key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True)) | ||||
|                     key = _predicate_cache_key(attr, type_, value[type_]) | ||||
|                     if key in seen_predicates: | ||||
|                         output.append(seen_predicates[key]) | ||||
|                     else: | ||||
|  |  | |||
|  | @ -6,9 +6,9 @@ from thinc.api import Model, Maxout, Linear, tuplify, Ragged | |||
| 
 | ||||
| from ...util import registry | ||||
| from ...kb import KnowledgeBase, InMemoryLookupKB | ||||
| from ...kb import Candidate, get_candidates, get_candidates_batch | ||||
| from ...kb import Candidate | ||||
| from ...vocab import Vocab | ||||
| from ...tokens import Span, Doc | ||||
| from ...tokens import Doc, Span, SpanGroup | ||||
| from ..extract_spans import extract_spans | ||||
| from ...errors import Errors | ||||
| 
 | ||||
|  | @ -89,6 +89,14 @@ def load_kb( | |||
|     return kb_from_file | ||||
| 
 | ||||
| 
 | ||||
@registry.misc("spacy.EmptyKB.v2")
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
    """Registered function that hands back a factory for empty knowledge bases.

    RETURNS (Callable[[Vocab, int], KnowledgeBase]): Callable that takes a
        vocab and an entity vector length and constructs an InMemoryLookupKB
        from them.
    """

    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
        kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
        return kb

    return empty_kb_factory
| 
 | ||||
| 
 | ||||
| @registry.misc("spacy.EmptyKB.v1") | ||||
| def empty_kb( | ||||
|     entity_vector_length: int, | ||||
|  | @ -106,6 +114,28 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: | |||
| 
 | ||||
| @registry.misc("spacy.CandidateBatchGenerator.v1") | ||||
| def create_candidates_batch() -> Callable[ | ||||
|     [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] | ||||
|     [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]] | ||||
| ]: | ||||
|     return get_candidates_batch | ||||
| 
 | ||||
| 
 | ||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
    """
    Look up the candidate entities for a single mention by delegating to the
    knowledge base's own `get_candidates` method.
    kb (KnowledgeBase): Knowledge base to query.
    mention (Span): Entity mention for which to identify candidates.
    RETURNS (Iterable[Candidate]): Identified candidates.
    """
    candidates = kb.get_candidates(mention)
    return candidates
| 
 | ||||
| 
 | ||||
def get_candidates_batch(
    kb: KnowledgeBase, mentions: SpanGroup
) -> Iterable[Iterable[Candidate]]:
    """
    Look up the candidate entities for a group of mentions by delegating to
    the knowledge base's own `get_candidates_batch` method.
    kb (KnowledgeBase): Knowledge base to query.
    mentions (SpanGroup): Entity mentions for which to identify candidates.
    RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
    """
    candidates_per_mention = kb.get_candidates_batch(mentions)
    return candidates_per_mention
|  |  | |||
|  | @ -249,9 +249,11 @@ cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states, | |||
|     cdef np.ndarray step_actions | ||||
| 
 | ||||
|     scores = [] | ||||
|     while sizes.states >= 1: | ||||
|     while sizes.states >= 1 and (actions is None or len(actions) > 0): | ||||
|         step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f") | ||||
|         step_actions = actions[0] if actions is not None else None | ||||
|         assert step_actions is None or step_actions.size == sizes.states, \ | ||||
|             f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})" | ||||
|         with nogil: | ||||
|             _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes) | ||||
|             if actions is None: | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any | ||||
| from typing import cast | ||||
| import warnings | ||||
| from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any, cast | ||||
| from numpy import dtype | ||||
| from thinc.types import Floats1d, Floats2d, Ints1d, Ragged | ||||
| from pathlib import Path | ||||
|  | @ -10,14 +10,15 @@ from thinc.api import CosineDistance, Model, Optimizer, Config | |||
| from thinc.api import set_dropout_rate | ||||
| 
 | ||||
| from ..kb import KnowledgeBase, Candidate | ||||
| from ..ml import empty_kb | ||||
| from ..tokens import Doc, Span | ||||
| from ..ml import empty_kb | ||||
| from ..tokens import Doc, Span, SpanGroup | ||||
| from .pipe import deserialize_config | ||||
| from .trainable_pipe import TrainablePipe | ||||
| from ..language import Language | ||||
| from ..vocab import Vocab | ||||
| from ..training import Example, validate_examples, validate_get_examples | ||||
| from ..errors import Errors | ||||
| from ..errors import Errors, Warnings | ||||
| from ..util import SimpleFrozenList, registry | ||||
| from .. import util | ||||
| from ..scorer import Scorer | ||||
|  | @ -58,6 +59,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] | |||
|         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, | ||||
|         "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, | ||||
|         "overwrite": False, | ||||
|         "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, | ||||
|         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, | ||||
|         "use_gold_ents": True, | ||||
|         "candidates_batch_size": 1, | ||||
|  | @ -82,8 +84,9 @@ def make_entity_linker( | |||
|     entity_vector_length: int, | ||||
|     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], | ||||
|     get_candidates_batch: Callable[ | ||||
|         [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] | ||||
|         [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]] | ||||
|     ], | ||||
|     generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], | ||||
|     overwrite: bool, | ||||
|     scorer: Optional[Callable], | ||||
|     use_gold_ents: bool, | ||||
|  | @ -104,8 +107,9 @@ def make_entity_linker( | |||
|     get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that | ||||
|         produces a list of candidates, given a certain knowledge base and a textual mention. | ||||
|     get_candidates_batch ( | ||||
|         Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] | ||||
|         Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]] | ||||
|         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. | ||||
|     generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. | ||||
|     scorer (Optional[Callable]): The scoring method. | ||||
|     use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another | ||||
|         component must provide entity annotations. | ||||
|  | @ -114,28 +118,9 @@ def make_entity_linker( | |||
|         prediction is discarded. If None, predictions are not filtered by any threshold. | ||||
|     save_activations (bool): save model activations in Doc when annotating. | ||||
|     """ | ||||
| 
 | ||||
|     if not model.attrs.get("include_span_maker", False): | ||||
|         try: | ||||
|             from spacy_legacy.components.entity_linker import EntityLinker_v1 | ||||
|         except: | ||||
|             raise ImportError( | ||||
|                 "In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12." | ||||
|             ) | ||||
|         # The only difference in arguments here is that use_gold_ents and threshold aren't available. | ||||
|         return EntityLinker_v1( | ||||
|             nlp.vocab, | ||||
|             model, | ||||
|             name, | ||||
|             labels_discard=labels_discard, | ||||
|             n_sents=n_sents, | ||||
|             incl_prior=incl_prior, | ||||
|             incl_context=incl_context, | ||||
|             entity_vector_length=entity_vector_length, | ||||
|             get_candidates=get_candidates, | ||||
|             overwrite=overwrite, | ||||
|             scorer=scorer, | ||||
|         ) | ||||
|         raise ValueError(Errors.E4005) | ||||
| 
 | ||||
|     return EntityLinker( | ||||
|         nlp.vocab, | ||||
|         model, | ||||
|  | @ -147,6 +132,7 @@ def make_entity_linker( | |||
|         entity_vector_length=entity_vector_length, | ||||
|         get_candidates=get_candidates, | ||||
|         get_candidates_batch=get_candidates_batch, | ||||
|         generate_empty_kb=generate_empty_kb, | ||||
|         overwrite=overwrite, | ||||
|         scorer=scorer, | ||||
|         use_gold_ents=use_gold_ents, | ||||
|  | @ -186,8 +172,9 @@ class EntityLinker(TrainablePipe): | |||
|         entity_vector_length: int, | ||||
|         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], | ||||
|         get_candidates_batch: Callable[ | ||||
|             [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] | ||||
|             [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]] | ||||
|         ], | ||||
|         generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], | ||||
|         overwrite: bool = False, | ||||
|         scorer: Optional[Callable] = entity_linker_score, | ||||
|         use_gold_ents: bool, | ||||
|  | @ -209,9 +196,10 @@ class EntityLinker(TrainablePipe): | |||
|         get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that | ||||
|             produces a list of candidates, given a certain knowledge base and a textual mention. | ||||
|         get_candidates_batch ( | ||||
|             Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], | ||||
|             Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], | ||||
|             Iterable[Candidate]] | ||||
|             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. | ||||
|         generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. | ||||
|         overwrite (bool): Whether to overwrite existing non-empty annotations. | ||||
|         scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. | ||||
|         use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another | ||||
|  | @ -219,6 +207,7 @@ class EntityLinker(TrainablePipe): | |||
|         candidates_batch_size (int): Size of batches for entity candidate generation. | ||||
|         threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the | ||||
|             threshold, prediction is discarded. If None, predictions are not filtered by any threshold. | ||||
|         save_activations (bool): save model activations in Doc when annotating. | ||||
|         DOCS: https://spacy.io/api/entitylinker#init | ||||
|         """ | ||||
| 
 | ||||
|  | @ -235,6 +224,7 @@ class EntityLinker(TrainablePipe): | |||
|         self.model = model | ||||
|         self.name = name | ||||
|         self.labels_discard = list(labels_discard) | ||||
|         # how many neighbour sentences to take into account | ||||
|         self.n_sents = n_sents | ||||
|         self.incl_prior = incl_prior | ||||
|         self.incl_context = incl_context | ||||
|  | @ -242,9 +232,7 @@ class EntityLinker(TrainablePipe): | |||
|         self.get_candidates_batch = get_candidates_batch | ||||
|         self.cfg: Dict[str, Any] = {"overwrite": overwrite} | ||||
|         self.distance = CosineDistance(normalize=False) | ||||
|         # how many neighbour sentences to take into account | ||||
|         # create an empty KB by default | ||||
|         self.kb = empty_kb(entity_vector_length)(self.vocab) | ||||
|         self.kb = generate_empty_kb(self.vocab, entity_vector_length) | ||||
|         self.scorer = scorer | ||||
|         self.use_gold_ents = use_gold_ents | ||||
|         self.candidates_batch_size = candidates_batch_size | ||||
|  | @ -253,6 +241,8 @@ class EntityLinker(TrainablePipe): | |||
| 
 | ||||
|         if candidates_batch_size < 1: | ||||
|             raise ValueError(Errors.E1044) | ||||
|         if self.incl_prior and not self.kb.supports_prior_probs: | ||||
|             warnings.warn(Warnings.W401) | ||||
| 
 | ||||
|     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): | ||||
|         """Define the KB of this pipe by providing a function that will | ||||
|  | @ -266,7 +256,7 @@ class EntityLinker(TrainablePipe): | |||
|         # Raise an error if the knowledge base is not initialized. | ||||
|         if self.kb is None: | ||||
|             raise ValueError(Errors.E1018.format(name=self.name)) | ||||
|         if len(self.kb) == 0: | ||||
|         if hasattr(self.kb, "is_empty") and self.kb.is_empty(): | ||||
|             raise ValueError(Errors.E139.format(name=self.name)) | ||||
| 
 | ||||
|     def initialize( | ||||
|  | @ -485,7 +475,8 @@ class EntityLinker(TrainablePipe): | |||
| 
 | ||||
|                 batch_candidates = list( | ||||
|                     self.get_candidates_batch( | ||||
|                         self.kb, [ent_batch[idx] for idx in valid_ent_idx] | ||||
|                         self.kb, | ||||
|                         SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]), | ||||
|                     ) | ||||
|                     if self.candidates_batch_size > 1 | ||||
|                     else [ | ||||
|  | @ -535,18 +526,19 @@ class EntityLinker(TrainablePipe): | |||
|                             ) | ||||
|                         elif len(candidates) == 1 and self.threshold is None: | ||||
|                             # shortcut for efficiency reasons: take the 1 candidate | ||||
|                             final_kb_ids.append(candidates[0].entity_) | ||||
|                             final_kb_ids.append(candidates[0].entity_id_) | ||||
|                             self._add_activations( | ||||
|                                 doc_scores=doc_scores, | ||||
|                                 doc_ents=doc_ents, | ||||
|                                 scores=[1.0], | ||||
|                                 ents=[candidates[0].entity_], | ||||
|                                 ents=[candidates[0].entity_id], | ||||
|                             ) | ||||
|                         else: | ||||
|                             random.shuffle(candidates) | ||||
|                             # set all prior probabilities to 0 if incl_prior=False | ||||
|                             prior_probs = xp.asarray([c.prior_prob for c in candidates]) | ||||
|                             if not self.incl_prior: | ||||
|                             if self.incl_prior and self.kb.supports_prior_probs: | ||||
|                                 prior_probs = xp.asarray([c.prior_prob for c in candidates])  # type: ignore | ||||
|                             else: | ||||
|                                 prior_probs = xp.asarray([0.0 for _ in candidates]) | ||||
|                             scores = prior_probs | ||||
|                             # add in similarity from the context | ||||
|  | @ -570,7 +562,7 @@ class EntityLinker(TrainablePipe): | |||
|                                     raise ValueError(Errors.E161) | ||||
|                                 scores = prior_probs + sims - (prior_probs * sims) | ||||
|                             final_kb_ids.append( | ||||
|                                 candidates[scores.argmax().item()].entity_ | ||||
|                                 candidates[scores.argmax().item()].entity_id_ | ||||
|                                 if self.threshold is None | ||||
|                                 or scores.max() >= self.threshold | ||||
|                                 else EntityLinker.NIL | ||||
|  | @ -579,7 +571,7 @@ class EntityLinker(TrainablePipe): | |||
|                                 doc_scores=doc_scores, | ||||
|                                 doc_ents=doc_ents, | ||||
|                                 scores=scores, | ||||
|                                 ents=[c.entity for c in candidates], | ||||
|                                 ents=[c.entity_id for c in candidates], | ||||
|                             ) | ||||
|             self._add_doc_activations( | ||||
|                 docs_scores=docs_scores, | ||||
|  |  | |||
|  | @ -167,7 +167,7 @@ class Lemmatizer(Pipe): | |||
|             missing_tables = set(required_tables) - set(lookups.tables) | ||||
|             if len(missing_tables) > 0: | ||||
|                 raise ValueError( | ||||
|                     Errors.E4005.format( | ||||
|                     Errors.E4007.format( | ||||
|                         missing_tables=list(missing_tables), | ||||
|                         pipe_name=self.name, | ||||
|                         required_tables=srsly.json_dumps(required_tables), | ||||
|  |  | |||
|  | @ -1,5 +1,6 @@ | |||
| from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any | ||||
| from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple | ||||
| from thinc.api import Model, set_dropout_rate, Optimizer, Config | ||||
| from thinc.types import Floats2d | ||||
| from itertools import islice | ||||
| 
 | ||||
| from .trainable_pipe import TrainablePipe | ||||
|  | @ -157,39 +158,9 @@ class Tok2Vec(TrainablePipe): | |||
| 
 | ||||
|         DOCS: https://spacy.io/api/tok2vec#update | ||||
|         """ | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|         validate_examples(examples, "Tok2Vec.update") | ||||
|         docs = [eg.predicted for eg in examples] | ||||
|         set_dropout_rate(self.model, drop) | ||||
|         tokvecs, bp_tokvecs = self.model.begin_update(docs) | ||||
|         d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] | ||||
|         losses.setdefault(self.name, 0.0) | ||||
| 
 | ||||
|         def accumulate_gradient(one_d_tokvecs): | ||||
|             """Accumulate tok2vec loss and gradient. This is passed as a callback | ||||
|             to all but the last listener. Only the last one does the backprop. | ||||
|             """ | ||||
|             nonlocal d_tokvecs | ||||
|             for i in range(len(one_d_tokvecs)): | ||||
|                 d_tokvecs[i] += one_d_tokvecs[i] | ||||
|                 losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) | ||||
|             return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] | ||||
| 
 | ||||
|         def backprop(one_d_tokvecs): | ||||
|             """Callback to actually do the backprop. Passed to last listener.""" | ||||
|             accumulate_gradient(one_d_tokvecs) | ||||
|             d_docs = bp_tokvecs(d_tokvecs) | ||||
|             if sgd is not None: | ||||
|                 self.finish_update(sgd) | ||||
|             return d_docs | ||||
| 
 | ||||
|         batch_id = Tok2VecListener.get_batch_id(docs) | ||||
|         for listener in self.listeners[:-1]: | ||||
|             listener.receive(batch_id, tokvecs, accumulate_gradient) | ||||
|         if self.listeners: | ||||
|             self.listeners[-1].receive(batch_id, tokvecs, backprop) | ||||
|         return losses | ||||
|         return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses) | ||||
| 
 | ||||
|     def get_loss(self, examples, scores) -> None: | ||||
|         pass | ||||
|  | @ -219,6 +190,96 @@ class Tok2Vec(TrainablePipe): | |||
|     def add_label(self, label): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def distill( | ||||
|         self, | ||||
|         teacher_pipe: Optional["TrainablePipe"], | ||||
|         examples: Iterable["Example"], | ||||
|         *, | ||||
|         drop: float = 0.0, | ||||
|         sgd: Optional[Optimizer] = None, | ||||
|         losses: Optional[Dict[str, float]] = None, | ||||
|     ) -> Dict[str, float]: | ||||
|         """Performs an update of the student pipe's model using the | ||||
|         student's distillation examples and sets the annotations | ||||
|         of the teacher's distillation examples using the teacher pipe. | ||||
| 
 | ||||
|         teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use | ||||
|             for prediction. | ||||
|         examples (Iterable[Example]): Distillation examples. The reference (teacher) | ||||
|             and predicted (student) docs must have the same number of tokens and the | ||||
|             same orthography. | ||||
|         drop (float): dropout rate. | ||||
|         sgd (Optional[Optimizer]): An optimizer. Will be created via | ||||
|             create_optimizer if not set. | ||||
|         losses (Optional[Dict[str, float]]): Optional record of loss during | ||||
|             distillation. | ||||
|         RETURNS: The updated losses dictionary. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/tok2vec#distill | ||||
|         """ | ||||
|         # By default we require a teacher pipe, but there are downstream | ||||
|         # implementations that don't require a pipe. | ||||
|         if teacher_pipe is None: | ||||
|             raise ValueError(Errors.E4002.format(name=self.name)) | ||||
|         teacher_docs = [eg.reference for eg in examples] | ||||
|         student_docs = [eg.predicted for eg in examples] | ||||
|         teacher_preds = teacher_pipe.predict(teacher_docs) | ||||
|         teacher_pipe.set_annotations(teacher_docs, teacher_preds) | ||||
|         return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses) | ||||
| 
 | ||||
|     def _update_with_docs( | ||||
|         self, | ||||
|         docs: Iterable[Doc], | ||||
|         *, | ||||
|         drop: float = 0.0, | ||||
|         sgd: Optional[Optimizer] = None, | ||||
|         losses: Optional[Dict[str, float]] = None, | ||||
|     ): | ||||
|         if losses is None: | ||||
|             losses = {} | ||||
|         losses.setdefault(self.name, 0.0) | ||||
|         set_dropout_rate(self.model, drop) | ||||
| 
 | ||||
|         tokvecs, accumulate_gradient, backprop = self._create_backprops( | ||||
|             docs, losses, sgd=sgd | ||||
|         ) | ||||
|         batch_id = Tok2VecListener.get_batch_id(docs) | ||||
|         for listener in self.listeners[:-1]: | ||||
|             listener.receive(batch_id, tokvecs, accumulate_gradient) | ||||
|         if self.listeners: | ||||
|             self.listeners[-1].receive(batch_id, tokvecs, backprop) | ||||
|         return losses | ||||
| 
 | ||||
|     def _create_backprops( | ||||
|         self, | ||||
|         docs: Iterable[Doc], | ||||
|         losses: Dict[str, float], | ||||
|         *, | ||||
|         sgd: Optional[Optimizer] = None, | ||||
|     ) -> Tuple[Floats2d, Callable, Callable]: | ||||
|         tokvecs, bp_tokvecs = self.model.begin_update(docs) | ||||
|         d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] | ||||
| 
 | ||||
|         def accumulate_gradient(one_d_tokvecs): | ||||
|             """Accumulate tok2vec loss and gradient. This is passed as a callback | ||||
|             to all but the last listener. Only the last one does the backprop. | ||||
|             """ | ||||
|             nonlocal d_tokvecs | ||||
|             for i in range(len(one_d_tokvecs)): | ||||
|                 d_tokvecs[i] += one_d_tokvecs[i] | ||||
|                 losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) | ||||
|             return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] | ||||
| 
 | ||||
|         def backprop(one_d_tokvecs): | ||||
|             """Callback to actually do the backprop. Passed to last listener.""" | ||||
|             accumulate_gradient(one_d_tokvecs) | ||||
|             d_docs = bp_tokvecs(d_tokvecs) | ||||
|             if sgd is not None: | ||||
|                 self.finish_update(sgd) | ||||
|             return d_docs | ||||
| 
 | ||||
|         return tokvecs, accumulate_gradient, backprop | ||||
| 
 | ||||
| 
 | ||||
| class Tok2VecListener(Model): | ||||
|     """A layer that gets fed its answers from an upstream connection, | ||||
|  |  | |||
|  | @ -36,6 +36,11 @@ from ..errors import Errors, Warnings | |||
| from .. import util | ||||
| 
 | ||||
| 
 | ||||
| # TODO: Remove when we switch to Cython 3. | ||||
| cdef extern from "<algorithm>" namespace "std" nogil: | ||||
|     bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except + | ||||
| 
 | ||||
| 
 | ||||
| NUMPY_OPS = NumpyOps() | ||||
| 
 | ||||
| 
 | ||||
|  | @ -253,8 +258,8 @@ class Parser(TrainablePipe): | |||
|             # batch uniform length. Since we do not have a gold standard | ||||
|             # sequence, we use the teacher's predictions as the gold | ||||
|             # standard. | ||||
|             max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) | ||||
|             states = self._init_batch(teacher_pipe, student_docs, max_moves) | ||||
|             max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2)) | ||||
|             states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves) | ||||
|         else: | ||||
|             states = self.moves.init_batch(student_docs) | ||||
| 
 | ||||
|  | @ -265,12 +270,12 @@ class Parser(TrainablePipe): | |||
|         # gradients of the student's transition distributions relative to the | ||||
|         # teacher's distributions. | ||||
| 
 | ||||
|         student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves, | ||||
|             max_moves=max_moves) | ||||
|         student_inputs = TransitionModelInputs(docs=student_docs, | ||||
|             states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves) | ||||
|         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) | ||||
|         actions = states2actions(student_states) | ||||
|         actions = _states_diff_to_actions(states, student_states) | ||||
|         teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples], | ||||
|             moves=self.moves, actions=actions) | ||||
|             states=states, moves=teacher_pipe.moves, actions=actions) | ||||
|         (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs) | ||||
| 
 | ||||
|         loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores) | ||||
|  | @ -522,7 +527,7 @@ class Parser(TrainablePipe): | |||
|         set_dropout_rate(self.model, 0.0) | ||||
|         student_inputs = TransitionModelInputs(docs=docs, moves=self.moves) | ||||
|         (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) | ||||
|         actions = states2actions(student_states) | ||||
|         actions = _states_to_actions(student_states) | ||||
|         teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions) | ||||
|         _, teacher_scores = self._rehearsal_model.predict(teacher_inputs) | ||||
| 
 | ||||
|  | @ -642,7 +647,7 @@ class Parser(TrainablePipe): | |||
|                     raise ValueError(Errors.E149) from None | ||||
|         return self | ||||
| 
 | ||||
|     def _init_batch(self, teacher_step_model, docs, max_length): | ||||
|     def _init_batch_from_teacher(self, teacher_pipe, docs, max_length): | ||||
|         """Make a square batch of length equal to the shortest transition | ||||
|         sequence or a cap. A long | ||||
|         doc will get multiple states. Let's say we have a doc of length 2*N, | ||||
|  | @ -651,10 +656,12 @@ class Parser(TrainablePipe): | |||
|         _init_gold_batch, this version uses a teacher model to generate the | ||||
|         cut sequences.""" | ||||
|         cdef: | ||||
|             StateClass start_state | ||||
|             StateClass state | ||||
|             Transition action | ||||
|         all_states = self.moves.init_batch(docs) | ||||
|             TransitionSystem moves = teacher_pipe.moves | ||||
| 
 | ||||
|         # Start with the same heuristic as in supervised training: exclude | ||||
|         # docs that are within the maximum length. | ||||
|         all_states = moves.init_batch(docs) | ||||
|         states = [] | ||||
|         to_cut = [] | ||||
|         for state, doc in zip(all_states, docs): | ||||
|  | @ -663,18 +670,28 @@ class Parser(TrainablePipe): | |||
|                     states.append(state) | ||||
|                 else: | ||||
|                     to_cut.append(state) | ||||
| 
 | ||||
|         if not to_cut: | ||||
|             return states | ||||
| 
 | ||||
|         # Parse the states that are too long with the teacher's parsing model. | ||||
|         teacher_inputs = TransitionModelInputs(docs=docs, moves=moves, | ||||
|             states=[state.copy() for state in to_cut]) | ||||
|         (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs) | ||||
| 
 | ||||
|         # Step through the teacher's actions and store every state after | ||||
|         # each multiple of max_length. | ||||
|         teacher_actions = _states_to_actions(teacher_states) | ||||
|         while to_cut: | ||||
|             states.extend(state.copy() for state in to_cut) | ||||
|             # Move states forward max_length actions. | ||||
|             length = 0 | ||||
|             while to_cut and length < max_length: | ||||
|                 teacher_scores = teacher_step_model.predict(to_cut) | ||||
|                 self.transition_states(to_cut, teacher_scores) | ||||
|                 # States that are completed do not need further cutting. | ||||
|                 to_cut = [state for state in to_cut if not state.is_final()] | ||||
|                 length += 1 | ||||
|         return states | ||||
|             for step_actions in teacher_actions[:max_length]: | ||||
|                 to_cut = moves.apply_actions(to_cut, step_actions) | ||||
|             teacher_actions = teacher_actions[max_length:] | ||||
| 
 | ||||
|             if len(teacher_actions) < max_length: | ||||
|                 break | ||||
| 
 | ||||
|         return states | ||||
| 
 | ||||
|     def _init_gold_batch(self, examples, max_length): | ||||
|         """Make a square batch, of length equal to the shortest transition | ||||
|  | @ -736,7 +753,7 @@ def _change_attrs(model, **kwargs): | |||
|             model.attrs[key] = value | ||||
| 
 | ||||
| 
 | ||||
| def states2actions(states: List[StateClass]) -> List[Ints1d]: | ||||
| def _states_to_actions(states: List[StateClass]) -> List[Ints1d]: | ||||
|     cdef int step | ||||
|     cdef StateClass state | ||||
|     cdef StateC* c_state | ||||
|  | @ -757,3 +774,45 @@ def states2actions(states: List[StateClass]) -> List[Ints1d]: | |||
|         actions.append(numpy.array(step_actions, dtype="i")) | ||||
| 
 | ||||
|     return actions | ||||
| 
 | ||||
| def _states_diff_to_actions( | ||||
|     before_states: List[StateClass], | ||||
|     after_states: List[StateClass] | ||||
| ) -> List[Ints1d]: | ||||
|     """ | ||||
|     Return for two sets of states the actions to go from the first set of | ||||
|     states to the second set of states. The histories of the first set of | ||||
|     states must be a prefix of the second set of states. | ||||
|     """ | ||||
|     cdef StateClass before_state, after_state | ||||
|     cdef StateC* c_state_before | ||||
|     cdef StateC* c_state_after | ||||
| 
 | ||||
|     assert len(before_states) == len(after_states) | ||||
| 
 | ||||
|     # Check invariant: before states histories must be prefixes of after states. | ||||
|     for before_state, after_state in zip(before_states, after_states): | ||||
|         c_state_before = before_state.c | ||||
|         c_state_after = after_state.c | ||||
| 
 | ||||
|         assert equal(c_state_before.history.begin(), c_state_before.history.end(), | ||||
|             c_state_after.history.begin()) | ||||
| 
 | ||||
|     actions = [] | ||||
|     while True: | ||||
|         step = len(actions) | ||||
| 
 | ||||
|         step_actions = [] | ||||
|         for before_state, after_state in zip(before_states, after_states): | ||||
|             c_state_before = before_state.c | ||||
|             c_state_after = after_state.c | ||||
|             if step < c_state_after.history.size() - c_state_before.history.size(): | ||||
|                 step_actions.append(c_state_after.history[c_state_before.history.size() + step]) | ||||
| 
 | ||||
|         # We are done if we have exhausted all histories. | ||||
|         if len(step_actions) == 0: | ||||
|             break | ||||
| 
 | ||||
|         actions.append(numpy.array(step_actions, dtype="i")) | ||||
| 
 | ||||
|     return actions | ||||
|  |  | |||
|  | @ -2,7 +2,7 @@ from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overlo | |||
| from pathlib import Path | ||||
| 
 | ||||
| class StringStore: | ||||
|     def __init__(self, strings: Optional[Iterable[str]]) -> None: ... | ||||
|     def __init__(self, strings: Optional[Iterable[str]] = None) -> None: ... | ||||
|     @overload | ||||
|     def __getitem__(self, string_or_hash: str) -> int: ... | ||||
|     @overload | ||||
|  |  | |||
|  | @ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text): | |||
| def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text): | ||||
|     tokens = sv_tokenizer(text) | ||||
|     assert len(tokens) == 3 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.issue(12311) | ||||
| @pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"]) | ||||
| def test_sv_tokenizer_handles_colon(sv_tokenizer, text): | ||||
|     tokens = sv_tokenizer(text) | ||||
|     assert len(tokens) == 1 | ||||
|  |  | |||
|  | @ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches): | |||
|         ("the", "brown", "$--", 0), | ||||
|         ("brown", "the", "$--", 1), | ||||
|         ("brown", "brown", "$--", 0), | ||||
|         ("over", "jumped", "<+", 0), | ||||
|         ("quick", "fox", "<+", 0), | ||||
|         ("the", "quick", "<+", 0), | ||||
|         ("brown", "fox", "<+", 1), | ||||
|         ("quick", "fox", "<++", 1), | ||||
|         ("quick", "over", "<++", 0), | ||||
|         ("over", "jumped", "<++", 0), | ||||
|         ("the", "fox", "<++", 2), | ||||
|         ("brown", "fox", "<-", 0), | ||||
|         ("fox", "over", "<-", 0), | ||||
|         ("the", "over", "<-", 0), | ||||
|         ("over", "jumped", "<-", 1), | ||||
|         ("brown", "fox", "<--", 0), | ||||
|         ("fox", "jumped", "<--", 0), | ||||
|         ("fox", "over", "<--", 1), | ||||
|         ("fox", "brown", ">+", 0), | ||||
|         ("over", "fox", ">+", 0), | ||||
|         ("over", "the", ">+", 0), | ||||
|         ("jumped", "over", ">+", 1), | ||||
|         ("jumped", "over", ">++", 1), | ||||
|         ("fox", "lazy", ">++", 0), | ||||
|         ("over", "the", ">++", 0), | ||||
|         ("jumped", "over", ">-", 0), | ||||
|         ("fox", "quick", ">-", 0), | ||||
|         ("brown", "quick", ">-", 0), | ||||
|         ("fox", "brown", ">-", 1), | ||||
|         ("brown", "fox", ">--", 0), | ||||
|         ("fox", "brown", ">--", 1), | ||||
|         ("jumped", "fox", ">--", 1), | ||||
|  |  | |||
							
								
								
									
										61
									
								
								spacy/tests/parser/test_model.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								spacy/tests/parser/test_model.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,61 @@ | |||
| import numpy | ||||
| import pytest | ||||
| 
 | ||||
| from spacy.lang.en import English | ||||
| from spacy.ml.tb_framework import TransitionModelInputs | ||||
| from spacy.training import Example | ||||
| 
 | ||||
| TRAIN_DATA = [ | ||||
|     ( | ||||
|         "They trade mortgage-backed securities.", | ||||
|         { | ||||
|             "heads": [1, 1, 4, 4, 5, 1, 1], | ||||
|             "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"], | ||||
|         }, | ||||
|     ), | ||||
|     ( | ||||
|         "I like London and Berlin.", | ||||
|         { | ||||
|             "heads": [1, 1, 1, 2, 2, 1], | ||||
|             "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], | ||||
|         }, | ||||
|     ), | ||||
| ] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def nlp_parser(): | ||||
|     nlp = English() | ||||
|     parser = nlp.add_pipe("parser") | ||||
| 
 | ||||
|     train_examples = [] | ||||
|     for text, annotations in TRAIN_DATA: | ||||
|         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) | ||||
|         for dep in annotations["deps"]: | ||||
|             parser.add_label(dep) | ||||
|     nlp.initialize() | ||||
| 
 | ||||
|     return nlp, parser | ||||
| 
 | ||||
| 
 | ||||
| def test_incorrect_number_of_actions(nlp_parser): | ||||
|     nlp, parser = nlp_parser | ||||
|     doc = nlp.make_doc("test") | ||||
| 
 | ||||
|     # Too many actions for the number of docs | ||||
|     with pytest.raises(AssertionError): | ||||
|         parser.model.predict( | ||||
|             TransitionModelInputs( | ||||
|                 docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")] | ||||
|             ) | ||||
|         ) | ||||
| 
 | ||||
|     # Too few actions for the number of docs | ||||
|     with pytest.raises(AssertionError): | ||||
|         parser.model.predict( | ||||
|             TransitionModelInputs( | ||||
|                 docs=[doc, doc], | ||||
|                 moves=parser.moves, | ||||
|                 actions=[numpy.array([0], dtype="i")], | ||||
|             ) | ||||
|         ) | ||||
|  | @ -623,7 +623,9 @@ def test_is_distillable(): | |||
|     assert ner.is_distillable | ||||
| 
 | ||||
| 
 | ||||
| def test_distill(): | ||||
| @pytest.mark.slow | ||||
| @pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) | ||||
| def test_distill(max_moves): | ||||
|     teacher = English() | ||||
|     teacher_ner = teacher.add_pipe("ner") | ||||
|     train_examples = [] | ||||
|  | @ -641,6 +643,7 @@ def test_distill(): | |||
| 
 | ||||
|     student = English() | ||||
|     student_ner = student.add_pipe("ner") | ||||
|     student_ner.cfg["update_with_oracle_cut_size"] = max_moves | ||||
|     student_ner.initialize( | ||||
|         get_examples=lambda: train_examples, labels=teacher_ner.label_data | ||||
|     ) | ||||
|  |  | |||
|  | @ -463,7 +463,9 @@ def test_is_distillable(): | |||
|     assert parser.is_distillable | ||||
| 
 | ||||
| 
 | ||||
| def test_distill(): | ||||
| @pytest.mark.slow | ||||
| @pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) | ||||
| def test_distill(max_moves): | ||||
|     teacher = English() | ||||
|     teacher_parser = teacher.add_pipe("parser") | ||||
|     train_examples = [] | ||||
|  | @ -481,6 +483,7 @@ def test_distill(): | |||
| 
 | ||||
|     student = English() | ||||
|     student_parser = student.add_pipe("parser") | ||||
|     student_parser.cfg["update_with_oracle_cut_size"] = max_moves | ||||
|     student_parser.initialize( | ||||
|         get_examples=lambda: train_examples, labels=teacher_parser.label_data | ||||
|     ) | ||||
|  |  | |||
|  | @ -7,10 +7,10 @@ from thinc.types import Ragged | |||
| from spacy import registry, util | ||||
| from spacy.attrs import ENT_KB_ID | ||||
| from spacy.compat import pickle | ||||
| from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase | ||||
| from spacy.kb import Candidate, InMemoryLookupKB, KnowledgeBase | ||||
| from spacy.lang.en import English | ||||
| from spacy.ml import load_kb | ||||
| from spacy.ml.models.entity_linker import build_span_maker | ||||
| from spacy.ml.models.entity_linker import build_span_maker, get_candidates | ||||
| from spacy.pipeline import EntityLinker, TrainablePipe | ||||
| from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL | ||||
| from spacy.scorer import Scorer | ||||
|  | @ -353,6 +353,9 @@ def test_kb_default(nlp): | |||
|     """Test that the default (empty) KB is loaded upon construction""" | ||||
|     entity_linker = nlp.add_pipe("entity_linker", config={}) | ||||
|     assert len(entity_linker.kb) == 0 | ||||
|     with pytest.raises(ValueError, match="E139"): | ||||
|         # this raises an error because the KB is empty | ||||
|         entity_linker.validate_kb() | ||||
|     assert entity_linker.kb.get_size_entities() == 0 | ||||
|     assert entity_linker.kb.get_size_aliases() == 0 | ||||
|     # 64 is the default value from pipeline.entity_linker | ||||
|  | @ -462,16 +465,17 @@ def test_candidate_generation(nlp): | |||
|     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) | ||||
| 
 | ||||
|     # test the size of the relevant candidates | ||||
|     adam_ent_cands = get_candidates(mykb, adam_ent) | ||||
|     assert len(get_candidates(mykb, douglas_ent)) == 2 | ||||
|     assert len(get_candidates(mykb, adam_ent)) == 1 | ||||
|     assert len(adam_ent_cands) == 1 | ||||
|     assert len(get_candidates(mykb, Adam_ent)) == 0  # default case sensitive | ||||
|     assert len(get_candidates(mykb, shrubbery_ent)) == 0 | ||||
| 
 | ||||
|     # test the content of the candidates | ||||
|     assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" | ||||
|     assert get_candidates(mykb, adam_ent)[0].alias_ == "adam" | ||||
|     assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) | ||||
|     assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) | ||||
|     assert adam_ent_cands[0].entity_id_ == "Q2" | ||||
|     assert adam_ent_cands[0].alias == "adam" | ||||
|     assert_almost_equal(adam_ent_cands[0].entity_freq, 12) | ||||
|     assert_almost_equal(adam_ent_cands[0].prior_prob, 0.9) | ||||
| 
 | ||||
| 
 | ||||
| def test_el_pipe_configuration(nlp): | ||||
|  | @ -499,7 +503,7 @@ def test_el_pipe_configuration(nlp): | |||
|     assert doc[2].ent_kb_id_ == "Q2" | ||||
| 
 | ||||
|     def get_lowercased_candidates(kb, span): | ||||
|         return kb.get_alias_candidates(span.text.lower()) | ||||
|         return kb._get_alias_candidates(span.text.lower()) | ||||
| 
 | ||||
|     def get_lowercased_candidates_batch(kb, spans): | ||||
|         return [get_lowercased_candidates(kb, span) for span in spans] | ||||
|  | @ -558,24 +562,22 @@ def test_vocab_serialization(nlp): | |||
|     mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]) | ||||
|     adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) | ||||
| 
 | ||||
|     candidates = mykb.get_alias_candidates("adam") | ||||
|     candidates = mykb._get_alias_candidates("adam") | ||||
|     assert len(candidates) == 1 | ||||
|     assert candidates[0].entity == q2_hash | ||||
|     assert candidates[0].entity_ == "Q2" | ||||
|     assert candidates[0].alias == adam_hash | ||||
|     assert candidates[0].alias_ == "adam" | ||||
|     assert candidates[0].entity_id == q2_hash | ||||
|     assert candidates[0].entity_id_ == "Q2" | ||||
|     assert candidates[0].alias == "adam" | ||||
| 
 | ||||
|     with make_tempdir() as d: | ||||
|         mykb.to_disk(d / "kb") | ||||
|         kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1) | ||||
|         kb_new_vocab.from_disk(d / "kb") | ||||
| 
 | ||||
|         candidates = kb_new_vocab.get_alias_candidates("adam") | ||||
|         candidates = kb_new_vocab._get_alias_candidates("adam") | ||||
|         assert len(candidates) == 1 | ||||
|         assert candidates[0].entity == q2_hash | ||||
|         assert candidates[0].entity_ == "Q2" | ||||
|         assert candidates[0].alias == adam_hash | ||||
|         assert candidates[0].alias_ == "adam" | ||||
|         assert candidates[0].entity_id == q2_hash | ||||
|         assert candidates[0].entity_id_ == "Q2" | ||||
|         assert candidates[0].alias == "adam" | ||||
| 
 | ||||
|         assert kb_new_vocab.get_vector("Q2") == [2] | ||||
|         assert_almost_equal(kb_new_vocab.get_prior_prob("Q2", "douglas"), 0.4) | ||||
|  | @ -595,20 +597,20 @@ def test_append_alias(nlp): | |||
|     mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) | ||||
| 
 | ||||
|     # test the size of the relevant candidates | ||||
|     assert len(mykb.get_alias_candidates("douglas")) == 2 | ||||
|     assert len(mykb._get_alias_candidates("douglas")) == 2 | ||||
| 
 | ||||
|     # append an alias | ||||
|     mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) | ||||
| 
 | ||||
|     # test the size of the relevant candidates has been incremented | ||||
|     assert len(mykb.get_alias_candidates("douglas")) == 3 | ||||
|     assert len(mykb._get_alias_candidates("douglas")) == 3 | ||||
| 
 | ||||
|     # appending the same alias-entity pair again should not work (will emit a warning) | ||||
|     with pytest.warns(UserWarning): | ||||
|         mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3) | ||||
| 
 | ||||
|     # test the size of the relevant candidates remained unchanged | ||||
|     assert len(mykb.get_alias_candidates("douglas")) == 3 | ||||
|     assert len(mykb._get_alias_candidates("douglas")) == 3 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore:\\[W036") | ||||
|  | @ -905,11 +907,11 @@ def test_kb_to_bytes(): | |||
|     assert kb_2.contains_alias("Russ Cochran") | ||||
|     assert kb_1.get_size_aliases() == kb_2.get_size_aliases() | ||||
|     assert kb_1.get_alias_strings() == kb_2.get_alias_strings() | ||||
|     assert len(kb_1.get_alias_candidates("Russ Cochran")) == len( | ||||
|         kb_2.get_alias_candidates("Russ Cochran") | ||||
|     assert len(kb_1._get_alias_candidates("Russ Cochran")) == len( | ||||
|         kb_2._get_alias_candidates("Russ Cochran") | ||||
|     ) | ||||
|     assert len(kb_1.get_alias_candidates("Randomness")) == len( | ||||
|         kb_2.get_alias_candidates("Randomness") | ||||
|     assert len(kb_1._get_alias_candidates("Randomness")) == len( | ||||
|         kb_2._get_alias_candidates("Randomness") | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -990,14 +992,11 @@ def test_scorer_links(): | |||
| @pytest.mark.parametrize( | ||||
|     "name,config", | ||||
|     [ | ||||
|         ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}), | ||||
|         ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), | ||||
|     ], | ||||
| ) | ||||
| # fmt: on | ||||
| def test_legacy_architectures(name, config): | ||||
|     from spacy_legacy.components.entity_linker import EntityLinker_v1 | ||||
| 
 | ||||
|     # Ensure that the legacy architectures still work | ||||
|     vector_length = 3 | ||||
|     nlp = English() | ||||
|  | @ -1019,10 +1018,7 @@ def test_legacy_architectures(name, config): | |||
|         return mykb | ||||
| 
 | ||||
|     entity_linker = nlp.add_pipe(name, config={"model": config}) | ||||
|     if config["@architectures"] == "spacy.EntityLinker.v1": | ||||
|         assert isinstance(entity_linker, EntityLinker_v1) | ||||
|     else: | ||||
|         assert isinstance(entity_linker, EntityLinker) | ||||
|     assert isinstance(entity_linker, EntityLinker) | ||||
|     entity_linker.set_kb(create_kb) | ||||
|     optimizer = nlp.initialize(get_examples=lambda: train_examples) | ||||
| 
 | ||||
|  |  | |||
|  | @ -9,6 +9,7 @@ from spacy.lang.en import English | |||
| from spacy.lang.en.syntax_iterators import noun_chunks | ||||
| from spacy.language import Language | ||||
| from spacy.pipeline import TrainablePipe | ||||
| from spacy.strings import StringStore | ||||
| from spacy.tokens import Doc | ||||
| from spacy.training import Example | ||||
| from spacy.util import SimpleFrozenList, get_arg_names, make_tempdir | ||||
|  | @ -131,7 +132,7 @@ def test_issue5458(): | |||
|     # Test that the noun chunker does not generate overlapping spans | ||||
|     # fmt: off | ||||
|     words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] | ||||
|     vocab = Vocab(strings=words) | ||||
|     vocab = Vocab(strings=StringStore(words)) | ||||
|     deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] | ||||
|     pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] | ||||
|     heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] | ||||
|  |  | |||
|  | @ -540,3 +540,86 @@ def test_tok2vec_listeners_textcat(): | |||
|     assert cats1["imperative"] < 0.9 | ||||
|     assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] | ||||
|     assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] | ||||
| 
 | ||||
| 
 | ||||
| cfg_string_distillation = """ | ||||
|     [nlp] | ||||
|     lang = "en" | ||||
|     pipeline = ["tok2vec","tagger"] | ||||
| 
 | ||||
|     [components] | ||||
| 
 | ||||
|     [components.tagger] | ||||
|     factory = "tagger" | ||||
| 
 | ||||
|     [components.tagger.model] | ||||
|     @architectures = "spacy.Tagger.v2" | ||||
|     nO = null | ||||
| 
 | ||||
|     [components.tagger.model.tok2vec] | ||||
|     @architectures = "spacy.Tok2VecListener.v1" | ||||
|     width = ${components.tok2vec.model.encode.width} | ||||
| 
 | ||||
|     [components.tok2vec] | ||||
|     factory = "tok2vec" | ||||
| 
 | ||||
|     [components.tok2vec.model] | ||||
|     @architectures = "spacy.Tok2Vec.v2" | ||||
| 
 | ||||
|     [components.tok2vec.model.embed] | ||||
|     @architectures = "spacy.MultiHashEmbed.v2" | ||||
|     width = ${components.tok2vec.model.encode.width} | ||||
|     rows = [2000, 1000, 1000, 1000] | ||||
|     attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] | ||||
|     include_static_vectors = false | ||||
| 
 | ||||
|     [components.tok2vec.model.encode] | ||||
|     @architectures = "spacy.MaxoutWindowEncoder.v2" | ||||
|     width = 96 | ||||
|     depth = 4 | ||||
|     window_size = 1 | ||||
|     maxout_pieces = 3 | ||||
|     """ | ||||
| 
 | ||||
| 
 | ||||
| def test_tok2vec_distillation_teacher_annotations(): | ||||
|     orig_config = Config().from_str(cfg_string_distillation) | ||||
|     teacher_nlp = util.load_model_from_config( | ||||
|         orig_config, auto_fill=True, validate=True | ||||
|     ) | ||||
|     student_nlp = util.load_model_from_config( | ||||
|         orig_config, auto_fill=True, validate=True | ||||
|     ) | ||||
| 
 | ||||
|     train_examples_teacher = [] | ||||
|     train_examples_student = [] | ||||
|     for t in TRAIN_DATA: | ||||
|         train_examples_teacher.append( | ||||
|             Example.from_dict(teacher_nlp.make_doc(t[0]), t[1]) | ||||
|         ) | ||||
|         train_examples_student.append( | ||||
|             Example.from_dict(student_nlp.make_doc(t[0]), t[1]) | ||||
|         ) | ||||
| 
 | ||||
|     optimizer = teacher_nlp.initialize(lambda: train_examples_teacher) | ||||
|     student_nlp.initialize(lambda: train_examples_student) | ||||
| 
 | ||||
|     # Since Language.distill creates a copy of the examples to use as | ||||
|     # its internal teacher/student docs, we'll need to monkey-patch the | ||||
|     # tok2vec pipe's distill method. | ||||
|     student_tok2vec = student_nlp.get_pipe("tok2vec") | ||||
|     student_tok2vec._old_distill = student_tok2vec.distill | ||||
| 
 | ||||
|     def tok2vec_distill_wrapper( | ||||
|         self, | ||||
|         teacher_pipe, | ||||
|         examples, | ||||
|         **kwargs, | ||||
|     ): | ||||
|         assert all(not eg.reference.tensor.any() for eg in examples) | ||||
|         out = self._old_distill(teacher_pipe, examples, **kwargs) | ||||
|         assert all(eg.reference.tensor.any() for eg in examples) | ||||
|         return out | ||||
| 
 | ||||
|     student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec) | ||||
|     student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={}) | ||||
|  |  | |||
|  | @ -1,7 +1,10 @@ | |||
| from typing import Callable | ||||
| from pathlib import Path | ||||
| from typing import Callable, Iterable, Any, Dict | ||||
| 
 | ||||
| from spacy import util | ||||
| from spacy.util import ensure_path, registry, load_model_from_config | ||||
| import srsly | ||||
| 
 | ||||
| from spacy import util, Errors | ||||
| from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList | ||||
| from spacy.kb.kb_in_memory import InMemoryLookupKB | ||||
| from spacy.vocab import Vocab | ||||
| from thinc.api import Config | ||||
|  | @ -63,19 +66,21 @@ def _check_kb(kb): | |||
|         assert alias_string not in kb.get_alias_strings() | ||||
| 
 | ||||
|     # check candidates & probabilities | ||||
|     candidates = sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_) | ||||
|     candidates = sorted( | ||||
|         kb._get_alias_candidates("double07"), key=lambda x: x.entity_id_ | ||||
|     ) | ||||
|     assert len(candidates) == 2 | ||||
| 
 | ||||
|     assert candidates[0].entity_ == "Q007" | ||||
|     assert candidates[0].entity_id_ == "Q007" | ||||
|     assert 6.999 < candidates[0].entity_freq < 7.01 | ||||
|     assert candidates[0].entity_vector == [0, 0, 7] | ||||
|     assert candidates[0].alias_ == "double07" | ||||
|     assert candidates[0].alias == "double07" | ||||
|     assert 0.899 < candidates[0].prior_prob < 0.901 | ||||
| 
 | ||||
|     assert candidates[1].entity_ == "Q17" | ||||
|     assert candidates[1].entity_id_ == "Q17" | ||||
|     assert 1.99 < candidates[1].entity_freq < 2.01 | ||||
|     assert candidates[1].entity_vector == [7, 1, 0] | ||||
|     assert candidates[1].alias_ == "double07" | ||||
|     assert candidates[1].alias == "double07" | ||||
|     assert 0.099 < candidates[1].prior_prob < 0.101 | ||||
| 
 | ||||
| 
 | ||||
|  | @ -91,7 +96,10 @@ def test_serialize_subclassed_kb(): | |||
| 
 | ||||
|     [components.entity_linker] | ||||
|     factory = "entity_linker" | ||||
| 
 | ||||
|      | ||||
|     [components.entity_linker.generate_empty_kb] | ||||
|     @misc = "kb_test.CustomEmptyKB.v1" | ||||
|      | ||||
|     [initialize] | ||||
| 
 | ||||
|     [initialize.components] | ||||
|  | @ -99,7 +107,7 @@ def test_serialize_subclassed_kb(): | |||
|     [initialize.components.entity_linker] | ||||
| 
 | ||||
|     [initialize.components.entity_linker.kb_loader] | ||||
|     @misc = "spacy.CustomKB.v1" | ||||
|     @misc = "kb_test.CustomKB.v1" | ||||
|     entity_vector_length = 342 | ||||
|     custom_field = 666 | ||||
|     """ | ||||
|  | @ -109,10 +117,57 @@ def test_serialize_subclassed_kb(): | |||
|             super().__init__(vocab, entity_vector_length) | ||||
|             self.custom_field = custom_field | ||||
| 
 | ||||
|     @registry.misc("spacy.CustomKB.v1") | ||||
|         def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): | ||||
|             """We override InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well.""" | ||||
|             path = ensure_path(path) | ||||
|             if not path.exists(): | ||||
|                 path.mkdir(parents=True) | ||||
|             if not path.is_dir(): | ||||
|                 raise ValueError(Errors.E928.format(loc=path)) | ||||
| 
 | ||||
|             def serialize_custom_fields(file_path: Path) -> None: | ||||
|                 srsly.write_json(file_path, {"custom_field": self.custom_field}) | ||||
| 
 | ||||
|             serialize = { | ||||
|                 "contents": lambda p: self.write_contents(p), | ||||
|                 "strings.json": lambda p: self.vocab.strings.to_disk(p), | ||||
|                 "custom_fields": lambda p: serialize_custom_fields(p), | ||||
|             } | ||||
|             util.to_disk(path, serialize, exclude) | ||||
| 
 | ||||
|         def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): | ||||
|             """We override InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well.""" | ||||
|             path = ensure_path(path) | ||||
|             if not path.exists(): | ||||
|                 raise ValueError(Errors.E929.format(loc=path)) | ||||
|             if not path.is_dir(): | ||||
|                 raise ValueError(Errors.E928.format(loc=path)) | ||||
| 
 | ||||
|             def deserialize_custom_fields(file_path: Path) -> None: | ||||
|                 self.custom_field = srsly.read_json(file_path)["custom_field"] | ||||
| 
 | ||||
|             deserialize: Dict[str, Callable[[Any], Any]] = { | ||||
|                 "contents": lambda p: self.read_contents(p), | ||||
|                 "strings.json": lambda p: self.vocab.strings.from_disk(p), | ||||
|                 "custom_fields": lambda p: deserialize_custom_fields(p), | ||||
|             } | ||||
|             util.from_disk(path, deserialize, exclude) | ||||
| 
 | ||||
|     @registry.misc("kb_test.CustomEmptyKB.v1") | ||||
|     def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]: | ||||
|         def empty_kb_factory(vocab: Vocab, entity_vector_length: int): | ||||
|             return SubInMemoryLookupKB( | ||||
|                 vocab=vocab, | ||||
|                 entity_vector_length=entity_vector_length, | ||||
|                 custom_field=0, | ||||
|             ) | ||||
| 
 | ||||
|         return empty_kb_factory | ||||
| 
 | ||||
|     @registry.misc("kb_test.CustomKB.v1") | ||||
|     def custom_kb( | ||||
|         entity_vector_length: int, custom_field: int | ||||
|     ) -> Callable[[Vocab], InMemoryLookupKB]: | ||||
|     ) -> Callable[[Vocab], SubInMemoryLookupKB]: | ||||
|         def custom_kb_factory(vocab): | ||||
|             kb = SubInMemoryLookupKB( | ||||
|                 vocab=vocab, | ||||
|  | @ -139,6 +194,6 @@ def test_serialize_subclassed_kb(): | |||
|         nlp2 = util.load_model_from_path(tmp_dir) | ||||
|         entity_linker2 = nlp2.get_pipe("entity_linker") | ||||
|         # After IO, the KB is still the custom subclass, with its custom field restored | ||||
|         assert type(entity_linker2.kb) == InMemoryLookupKB | ||||
|         assert type(entity_linker2.kb) == SubInMemoryLookupKB | ||||
|         assert entity_linker2.kb.entity_vector_length == 342 | ||||
|         assert not hasattr(entity_linker2.kb, "custom_field") | ||||
|         assert entity_linker2.kb.custom_field == 666 | ||||
|  |  | |||
|  | @ -13,8 +13,11 @@ from spacy.vocab import Vocab | |||
| 
 | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] | ||||
| test_strings_attrs = [(["rats", "are", "cute"], "Hello")] | ||||
| test_strings = [ | ||||
|     (StringStore(), StringStore()), | ||||
|     (StringStore(["rats", "are", "cute"]), StringStore(["i", "like", "rats"])), | ||||
| ] | ||||
| test_strings_attrs = [(StringStore(["rats", "are", "cute"]), "Hello")] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.issue(599) | ||||
|  | @ -81,7 +84,7 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): | |||
|     vocab2 = Vocab(strings=strings2) | ||||
|     vocab1_b = vocab1.to_bytes() | ||||
|     vocab2_b = vocab2.to_bytes() | ||||
|     if strings1 == strings2: | ||||
|     if strings1.to_bytes() == strings2.to_bytes(): | ||||
|         assert vocab1_b == vocab2_b | ||||
|     else: | ||||
|         assert vocab1_b != vocab2_b | ||||
|  | @ -117,11 +120,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): | |||
| def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr): | ||||
|     vocab1 = Vocab(strings=strings) | ||||
|     vocab2 = Vocab() | ||||
|     vocab1[strings[0]].norm_ = lex_attr | ||||
|     assert vocab1[strings[0]].norm_ == lex_attr | ||||
|     assert vocab2[strings[0]].norm_ != lex_attr | ||||
|     s = next(iter(vocab1.strings)) | ||||
|     vocab1[s].norm_ = lex_attr | ||||
|     assert vocab1[s].norm_ == lex_attr | ||||
|     assert vocab2[s].norm_ != lex_attr | ||||
|     vocab2 = vocab2.from_bytes(vocab1.to_bytes()) | ||||
|     assert vocab2[strings[0]].norm_ == lex_attr | ||||
|     assert vocab2[s].norm_ == lex_attr | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) | ||||
|  | @ -136,14 +140,15 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr): | |||
| def test_serialize_vocab_lex_attrs_disk(strings, lex_attr): | ||||
|     vocab1 = Vocab(strings=strings) | ||||
|     vocab2 = Vocab() | ||||
|     vocab1[strings[0]].norm_ = lex_attr | ||||
|     assert vocab1[strings[0]].norm_ == lex_attr | ||||
|     assert vocab2[strings[0]].norm_ != lex_attr | ||||
|     s = next(iter(vocab1.strings)) | ||||
|     vocab1[s].norm_ = lex_attr | ||||
|     assert vocab1[s].norm_ == lex_attr | ||||
|     assert vocab2[s].norm_ != lex_attr | ||||
|     with make_tempdir() as d: | ||||
|         file_path = d / "vocab" | ||||
|         vocab1.to_disk(file_path) | ||||
|         vocab2 = vocab2.from_disk(file_path) | ||||
|     assert vocab2[strings[0]].norm_ == lex_attr | ||||
|     assert vocab2[s].norm_ == lex_attr | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("strings1,strings2", test_strings) | ||||
|  |  | |||
|  | @ -2,7 +2,6 @@ import os | |||
| import math | ||||
| from collections import Counter | ||||
| from typing import Tuple, List, Dict, Any | ||||
| import pkg_resources | ||||
| import time | ||||
| from pathlib import Path | ||||
| 
 | ||||
|  | @ -1126,6 +1125,7 @@ def test_cli_find_threshold(capsys): | |||
|                 ) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.filterwarnings("ignore::DeprecationWarning") | ||||
| @pytest.mark.parametrize( | ||||
|     "reqs,output", | ||||
|     [ | ||||
|  | @ -1158,6 +1158,8 @@ def test_cli_find_threshold(capsys): | |||
|     ], | ||||
| ) | ||||
| def test_project_check_requirements(reqs, output): | ||||
|     import pkg_resources | ||||
| 
 | ||||
|     # excessive guard against unlikely package name | ||||
|     try: | ||||
|         pkg_resources.require("spacyunknowndoesnotexist12345") | ||||
|  |  | |||
|  | @ -1,5 +1,7 @@ | |||
| import os | ||||
| from pathlib import Path | ||||
| import pytest | ||||
| import srsly | ||||
| from typer.testing import CliRunner | ||||
| from spacy.tokens import DocBin, Doc | ||||
| 
 | ||||
|  | @ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab): | |||
|         # Instead of checking specific wording of the output, which may change, | ||||
|         # we'll check that this section of the debug output is present. | ||||
|         assert "= Trainable Lemmatizer =" in result_debug_data.stdout | ||||
| 
 | ||||
| 
 | ||||
| # project tests | ||||
| 
 | ||||
| SAMPLE_PROJECT = { | ||||
|     "title": "Sample project", | ||||
|     "description": "This is a project for testing", | ||||
|     "assets": [ | ||||
|         { | ||||
|             "dest": "assets/spacy-readme.md", | ||||
|             "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md", | ||||
|             "checksum": "411b2c89ccf34288fae8ed126bf652f7", | ||||
|         }, | ||||
|         { | ||||
|             "dest": "assets/citation.cff", | ||||
|             "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff", | ||||
|             "checksum": "c996bfd80202d480eb2e592369714e5e", | ||||
|             "extra": True, | ||||
|         }, | ||||
|     ], | ||||
|     "commands": [ | ||||
|         { | ||||
|             "name": "ok", | ||||
|             "help": "print ok", | ||||
|             "script": ["python -c \"print('okokok')\""], | ||||
|         }, | ||||
|         { | ||||
|             "name": "create", | ||||
|             "help": "make a file", | ||||
|             "script": ["touch abc.txt"], | ||||
|             "outputs": ["abc.txt"], | ||||
|         }, | ||||
|         { | ||||
|             "name": "clean", | ||||
|             "help": "remove test file", | ||||
|             "script": ["rm abc.txt"], | ||||
|         }, | ||||
|     ], | ||||
| } | ||||
| 
 | ||||
| SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def project_dir(): | ||||
|     with make_tempdir() as pdir: | ||||
|         (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT) | ||||
|         yield pdir | ||||
| 
 | ||||
| 
 | ||||
| def test_project_document(project_dir): | ||||
|     readme_path = project_dir / "README.md" | ||||
|     assert not readme_path.exists(), "README already exists" | ||||
|     result = CliRunner().invoke( | ||||
|         app, ["project", "document", str(project_dir), "-o", str(readme_path)] | ||||
|     ) | ||||
|     assert result.exit_code == 0 | ||||
|     assert readme_path.is_file() | ||||
|     text = readme_path.read_text("utf-8") | ||||
|     assert SAMPLE_PROJECT["description"] in text | ||||
| 
 | ||||
| 
 | ||||
| def test_project_assets(project_dir): | ||||
|     asset_dir = project_dir / "assets" | ||||
|     assert not asset_dir.exists(), "Assets dir is already present" | ||||
|     result = CliRunner().invoke(app, ["project", "assets", str(project_dir)]) | ||||
|     assert result.exit_code == 0 | ||||
|     assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded" | ||||
|     # check that extras work | ||||
|     result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)]) | ||||
|     assert result.exit_code == 0 | ||||
|     assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded" | ||||
| 
 | ||||
| 
 | ||||
| def test_project_run(project_dir): | ||||
|     # make sure dry run works | ||||
|     test_file = project_dir / "abc.txt" | ||||
|     result = CliRunner().invoke( | ||||
|         app, ["project", "run", "--dry", "create", str(project_dir)] | ||||
|     ) | ||||
|     assert result.exit_code == 0 | ||||
|     assert not test_file.is_file() | ||||
|     result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) | ||||
|     assert result.exit_code == 0 | ||||
|     assert test_file.is_file() | ||||
|     result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)]) | ||||
|     assert result.exit_code == 0 | ||||
|     assert "okokok" in result.stdout | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     "options", | ||||
|     [ | ||||
|         "", | ||||
|         # "--sparse", | ||||
|         "--branch v3", | ||||
|         "--repo https://github.com/explosion/projects --branch v3", | ||||
|     ], | ||||
| ) | ||||
| def test_project_clone(options): | ||||
|     with make_tempdir() as workspace: | ||||
|         out = workspace / "project" | ||||
|         target = "benchmarks/ner_conll03" | ||||
|         if not options: | ||||
|             options = [] | ||||
|         else: | ||||
|             options = options.split() | ||||
|         result = CliRunner().invoke( | ||||
|             app, ["project", "clone", target, *options, str(out)] | ||||
|         ) | ||||
|         assert result.exit_code == 0 | ||||
|         assert (out / "README.md").is_file() | ||||
| 
 | ||||
| 
 | ||||
| def test_project_push_pull(project_dir): | ||||
|     proj = dict(SAMPLE_PROJECT) | ||||
|     remote = "xyz" | ||||
| 
 | ||||
|     with make_tempdir() as remote_dir: | ||||
|         proj["remotes"] = {remote: str(remote_dir)} | ||||
|         proj_text = srsly.yaml_dumps(proj) | ||||
|         (project_dir / "project.yml").write_text(proj_text) | ||||
| 
 | ||||
|         test_file = project_dir / "abc.txt" | ||||
|         result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) | ||||
|         assert result.exit_code == 0 | ||||
|         assert test_file.is_file() | ||||
|         result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)]) | ||||
|         assert result.exit_code == 0 | ||||
|         result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)]) | ||||
|         assert result.exit_code == 0 | ||||
|         assert not test_file.exists() | ||||
|         result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)]) | ||||
|         assert result.exit_code == 0 | ||||
|         assert test_file.is_file() | ||||
|  |  | |||
|  | @ -98,7 +98,7 @@ def assert_sents_error(doc): | |||
| 
 | ||||
| def warn_error(proc_name, proc, docs, e): | ||||
|     logger = logging.getLogger("spacy") | ||||
|     logger.warning(f"Trouble with component {proc_name}.") | ||||
|     logger.warning("Trouble with component %s.", proc_name) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
|  |  | |||
|  | @ -17,7 +17,7 @@ def test_issue361(en_vocab, text1, text2): | |||
| 
 | ||||
| @pytest.mark.issue(600) | ||||
| def test_issue600(): | ||||
|     vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) | ||||
|     vocab = Vocab() | ||||
|     doc = Doc(vocab, words=["hello"]) | ||||
|     doc[0].tag_ = "NN" | ||||
| 
 | ||||
|  |  | |||
|  | @ -105,6 +105,7 @@ class Doc: | |||
|         start_idx: int, | ||||
|         end_idx: int, | ||||
|         label: Union[int, str] = ..., | ||||
|         *, | ||||
|         kb_id: Union[int, str] = ..., | ||||
|         vector: Optional[Floats1d] = ..., | ||||
|         alignment_mode: str = ..., | ||||
|  | @ -127,12 +128,12 @@ class Doc: | |||
|         blocked: Optional[List[Span]] = ..., | ||||
|         missing: Optional[List[Span]] = ..., | ||||
|         outside: Optional[List[Span]] = ..., | ||||
|         default: str = ... | ||||
|         default: str = ..., | ||||
|     ) -> None: ... | ||||
|     @property | ||||
|     def noun_chunks(self) -> Iterator[Span]: ... | ||||
|     def noun_chunks(self) -> Tuple[Span]: ... | ||||
|     @property | ||||
|     def sents(self) -> Iterator[Span]: ... | ||||
|     def sents(self) -> Tuple[Span]: ... | ||||
|     @property | ||||
|     def lang(self) -> int: ... | ||||
|     @property | ||||
|  |  | |||
|  | @ -520,7 +520,7 @@ cdef class Doc: | |||
|     def doc(self): | ||||
|         return self | ||||
| 
 | ||||
|     def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): | ||||
|     def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0): | ||||
|         """Create a `Span` object from the slice | ||||
|         `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be | ||||
|         created. | ||||
|  | @ -657,9 +657,6 @@ cdef class Doc: | |||
|             elif self.vocab.vectors.size > 0: | ||||
|                 self._vector = sum(t.vector for t in self) / len(self) | ||||
|                 return self._vector | ||||
|             elif self.tensor.size > 0: | ||||
|                 self._vector = self.tensor.mean(axis=0) | ||||
|                 return self._vector | ||||
|             else: | ||||
|                 return xp.zeros((self.vocab.vectors_length,), dtype="float32") | ||||
| 
 | ||||
|  | @ -706,10 +703,10 @@ cdef class Doc: | |||
|         return self.text | ||||
| 
 | ||||
|     property ents: | ||||
|         """The named entities in the document. Returns a tuple of named entity | ||||
|         """The named entities in the document. Returns a tuple of named entity | ||||
|         `Span` objects, if the entity recognizer has been applied. | ||||
| 
 | ||||
|         RETURNS (tuple): Entities in the document, one `Span` per entity. | ||||
|         RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#ents | ||||
|         """ | ||||
|  | @ -867,7 +864,7 @@ cdef class Doc: | |||
|         NP-level coordination, no prepositional phrases, and no relative | ||||
|         clauses. | ||||
| 
 | ||||
|         YIELDS (Span): Noun chunks in the document. | ||||
|         RETURNS (Tuple[Span]): Noun chunks in the document. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#noun_chunks | ||||
|         """ | ||||
|  | @ -876,36 +873,35 @@ cdef class Doc: | |||
| 
 | ||||
|         # Accumulate the result before beginning to iterate over it. This | ||||
|         # prevents the tokenization from being changed out from under us | ||||
|         # during the iteration. The tricky thing here is that Span accepts | ||||
|         # its tokenization changing, so it's okay once we have the Span | ||||
|         # objects. See Issue #375. | ||||
|         # during the iteration. | ||||
|         spans = [] | ||||
|         for start, end, label in self.noun_chunks_iterator(self): | ||||
|             spans.append(Span(self, start, end, label=label)) | ||||
|         for span in spans: | ||||
|             yield span | ||||
|         return tuple(spans) | ||||
| 
 | ||||
|     @property | ||||
|     def sents(self): | ||||
|         """The sentences in the document. Returns a tuple of sentence `Span` | ||||
|         objects. Sentence spans have no label. | ||||
| 
 | ||||
|         YIELDS (Span): Sentences in the document. | ||||
|         RETURNS (Tuple[Span]): Sentences in the document. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/doc#sents | ||||
|         """ | ||||
|         if not self.has_annotation("SENT_START"): | ||||
|             raise ValueError(Errors.E030) | ||||
|         if "sents" in self.user_hooks: | ||||
|             yield from self.user_hooks["sents"](self) | ||||
|             return tuple(self.user_hooks["sents"](self)) | ||||
|         else: | ||||
|             start = 0 | ||||
|             spans = [] | ||||
|             for i in range(1, self.length): | ||||
|                 if self.c[i].sent_start == 1: | ||||
|                     yield Span(self, start, i) | ||||
|                     spans.append(Span(self, start, i)) | ||||
|                     start = i | ||||
|             if start != self.length: | ||||
|                 yield Span(self, start, self.length) | ||||
|                 spans.append(Span(self, start, self.length)) | ||||
|             return tuple(spans) | ||||
| 
 | ||||
|     @property | ||||
|     def lang(self): | ||||
|  | @ -1605,7 +1601,7 @@ cdef class Doc: | |||
|         for span_group in doc_json.get("spans", {}): | ||||
|             spans = [] | ||||
|             for span in doc_json["spans"][span_group]: | ||||
|                 char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"]) | ||||
|                 char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"]) | ||||
|                 if char_span is None: | ||||
|                     raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"])) | ||||
|                 spans.append(char_span) | ||||
|  |  | |||
|  | @ -74,6 +74,8 @@ class Span: | |||
|     @property | ||||
|     def ents(self) -> Tuple[Span]: ... | ||||
|     @property | ||||
|     def sents(self) -> Tuple[Span]: ... | ||||
|     @property | ||||
|     def has_vector(self) -> bool: ... | ||||
|     @property | ||||
|     def vector(self) -> Floats1d: ... | ||||
|  | @ -86,7 +88,7 @@ class Span: | |||
|     @property | ||||
|     def text_with_ws(self) -> str: ... | ||||
|     @property | ||||
|     def noun_chunks(self) -> Iterator[Span]: ... | ||||
|     def noun_chunks(self) -> Tuple[Span]: ... | ||||
|     @property | ||||
|     def root(self) -> Token: ... | ||||
|     def char_span( | ||||
|  | @ -94,6 +96,7 @@ class Span: | |||
|         start_idx: int, | ||||
|         end_idx: int, | ||||
|         label: Union[int, str] = ..., | ||||
|         *, | ||||
|         kb_id: Union[int, str] = ..., | ||||
|         vector: Optional[Floats1d] = ..., | ||||
|         alignment_mode: str = ..., | ||||
|  |  | |||
|  | @ -461,20 +461,21 @@ cdef class Span: | |||
|         """Obtain the sentences that contain this span. If the given span | ||||
|         crosses sentence boundaries, return all sentences it is a part of. | ||||
| 
 | ||||
|         RETURNS (Iterable[Span]): All sentences that the span is a part of. | ||||
|         RETURNS (Tuple[Span]): All sentences that the span is a part of. | ||||
| 
 | ||||
|          DOCS: https://spacy.io/api/span#sents | ||||
|         DOCS: https://spacy.io/api/span#sents | ||||
|         """ | ||||
|         cdef int start | ||||
|         cdef int i | ||||
| 
 | ||||
|         if "sents" in self.doc.user_span_hooks: | ||||
|             yield from self.doc.user_span_hooks["sents"](self) | ||||
|         elif "sents" in self.doc.user_hooks: | ||||
|             return tuple(self.doc.user_span_hooks["sents"](self)) | ||||
|         spans = [] | ||||
|         if "sents" in self.doc.user_hooks: | ||||
|             for sentence in self.doc.user_hooks["sents"](self.doc): | ||||
|                 if sentence.end > self.start: | ||||
|                     if sentence.start < self.end or sentence.start == self.start == self.end: | ||||
|                         yield sentence | ||||
|                         spans.append(sentence) | ||||
|                     else: | ||||
|                         break | ||||
|         else: | ||||
|  | @ -489,12 +490,13 @@ cdef class Span: | |||
|             # Now, find all the sentences in the span | ||||
|             for i in range(start + 1, self.doc.length): | ||||
|                 if self.doc.c[i].sent_start == 1: | ||||
|                     yield Span(self.doc, start, i) | ||||
|                     spans.append(Span(self.doc, start, i)) | ||||
|                     start = i | ||||
|                     if start >= self.end: | ||||
|                         break | ||||
|             if start < self.end: | ||||
|                 yield Span(self.doc, start, self.end) | ||||
|                 spans.append(Span(self.doc, start, self.end)) | ||||
|         return tuple(spans) | ||||
| 
 | ||||
| 
 | ||||
|     @property | ||||
|  | @ -502,7 +504,7 @@ cdef class Span: | |||
|         """The named entities that fall completely within the span. Returns | ||||
|         a tuple of `Span` objects. | ||||
| 
 | ||||
|         RETURNS (tuple): Entities in the span, one `Span` per entity. | ||||
|         RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#ents | ||||
|         """ | ||||
|  | @ -517,7 +519,7 @@ cdef class Span: | |||
|                     ents.append(ent) | ||||
|                 else: | ||||
|                     break | ||||
|         return ents | ||||
|         return tuple(ents) | ||||
| 
 | ||||
|     @property | ||||
|     def has_vector(self): | ||||
|  | @ -532,8 +534,6 @@ cdef class Span: | |||
|             return self.doc.user_span_hooks["has_vector"](self) | ||||
|         elif self.vocab.vectors.size > 0: | ||||
|             return any(token.has_vector for token in self) | ||||
|         elif self.doc.tensor.size > 0: | ||||
|             return True | ||||
|         else: | ||||
|             return False | ||||
| 
 | ||||
|  | @ -615,13 +615,15 @@ cdef class Span: | |||
|         NP-level coordination, no prepositional phrases, and no relative | ||||
|         clauses. | ||||
| 
 | ||||
|         YIELDS (Span): Noun chunks in the span. | ||||
|         RETURNS (Tuple[Span]): Noun chunks in the span. | ||||
| 
 | ||||
|         DOCS: https://spacy.io/api/span#noun_chunks | ||||
|         """ | ||||
|         spans = [] | ||||
|         for span in self.doc.noun_chunks: | ||||
|             if span.start >= self.start and span.end <= self.end: | ||||
|                 yield span | ||||
|                 spans.append(span) | ||||
|         return tuple(spans) | ||||
| 
 | ||||
|     @property | ||||
|     def root(self): | ||||
|  | @ -666,11 +668,11 @@ cdef class Span: | |||
|         else: | ||||
|             return self.doc[root] | ||||
| 
 | ||||
|     def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): | ||||
|     def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0): | ||||
|         """Create a `Span` object from the slice `span.text[start : end]`. | ||||
| 
 | ||||
|         start (int): The index of the first character of the span. | ||||
|         end (int): The index of the first character after the span. | ||||
|         start_idx (int): The index of the first character of the span. | ||||
|         end_idx (int): The index of the first character after the span. | ||||
|         label (Union[int, str]): A label to attach to the Span, e.g. for | ||||
|             named entities. | ||||
|         kb_id (Union[int, str]):  An ID from a KB to capture the meaning of a named entity. | ||||
|  |  | |||
|  | @ -389,8 +389,6 @@ cdef class Token: | |||
|         """ | ||||
|         if "has_vector" in self.doc.user_token_hooks: | ||||
|             return self.doc.user_token_hooks["has_vector"](self) | ||||
|         if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: | ||||
|             return True | ||||
|         return self.vocab.has_vector(self.c.lex.orth) | ||||
| 
 | ||||
|     @property | ||||
|  | @ -404,8 +402,6 @@ cdef class Token: | |||
|         """ | ||||
|         if "vector" in self.doc.user_token_hooks: | ||||
|             return self.doc.user_token_hooks["vector"](self) | ||||
|         if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: | ||||
|             return self.doc.tensor[self.i] | ||||
|         else: | ||||
|             return self.vocab.get_vector(self.c.lex.orth) | ||||
| 
 | ||||
|  |  | |||
|  | @ -11,7 +11,7 @@ def create_copy_from_base_model( | |||
| ) -> Callable[[Language], Language]: | ||||
|     def copy_from_base_model(nlp): | ||||
|         if tokenizer: | ||||
|             logger.info(f"Copying tokenizer from: {tokenizer}") | ||||
|             logger.info("Copying tokenizer from: %s", tokenizer) | ||||
|             base_nlp = load_model(tokenizer) | ||||
|             if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]: | ||||
|                 nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"])) | ||||
|  | @ -23,7 +23,7 @@ def create_copy_from_base_model( | |||
|                     ) | ||||
|                 ) | ||||
|         if vocab: | ||||
|             logger.info(f"Copying vocab from: {vocab}") | ||||
|             logger.info("Copying vocab from: %s", vocab) | ||||
|             # only reload if the vocab is from a different model | ||||
|             if tokenizer != vocab: | ||||
|                 base_nlp = load_model(vocab) | ||||
|  |  | |||
|  | @ -29,7 +29,7 @@ def create_docbin_reader( | |||
| ) -> Callable[["Language"], Iterable[Example]]: | ||||
|     if path is None: | ||||
|         raise ValueError(Errors.E913) | ||||
|     util.logger.debug(f"Loading corpus from path: {path}") | ||||
|     util.logger.debug("Loading corpus from path: %s", path) | ||||
|     return Corpus( | ||||
|         path, | ||||
|         gold_preproc=gold_preproc, | ||||
|  |  | |||
|  | @ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": | |||
|     frozen_components = T["frozen_components"] | ||||
|     # Sourced components that require resume_training | ||||
|     resume_components = [p for p in sourced if p not in frozen_components] | ||||
|     logger.info(f"Pipeline: {nlp.pipe_names}") | ||||
|     logger.info("Pipeline: %s", nlp.pipe_names) | ||||
|     if resume_components: | ||||
|         with nlp.select_pipes(enable=resume_components): | ||||
|             logger.info(f"Resuming training for: {resume_components}") | ||||
|             logger.info("Resuming training for: %s", resume_components) | ||||
|             nlp.resume_training(sgd=optimizer) | ||||
|     # Make sure that listeners are defined before initializing further | ||||
|     nlp._link_components() | ||||
|  | @ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": | |||
|         if T["max_epochs"] == -1: | ||||
|             sample_size = 100 | ||||
|             logger.debug( | ||||
|                 f"Due to streamed train corpus, using only first {sample_size} " | ||||
|                 f"examples for initialization. If necessary, provide all labels " | ||||
|                 f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" | ||||
|                 "Due to streamed train corpus, using only first %s examples for initialization. " | ||||
|                 "If necessary, provide all labels in [initialize]. " | ||||
|                 "More info: https://spacy.io/api/cli#init_labels", | ||||
|                 sample_size, | ||||
|             ) | ||||
|             nlp.initialize( | ||||
|                 lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer | ||||
|             ) | ||||
|         else: | ||||
|             nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) | ||||
|         logger.info(f"Initialized pipeline components: {nlp.pipe_names}") | ||||
|         logger.info("Initialized pipeline components: %s", nlp.pipe_names) | ||||
|     # Detect components with listeners that are not frozen consistently | ||||
|     for name, proc in nlp.pipeline: | ||||
|         for listener in getattr( | ||||
|  | @ -109,7 +110,7 @@ def init_vocab( | |||
| ) -> None: | ||||
|     if lookups: | ||||
|         nlp.vocab.lookups = lookups | ||||
|         logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") | ||||
|         logger.info("Added vocab lookups: %s", ", ".join(lookups.tables)) | ||||
|     data_path = ensure_path(data) | ||||
|     if data_path is not None: | ||||
|         lex_attrs = srsly.read_jsonl(data_path) | ||||
|  | @ -125,11 +126,11 @@ def init_vocab( | |||
|         else: | ||||
|             oov_prob = DEFAULT_OOV_PROB | ||||
|         nlp.vocab.cfg.update({"oov_prob": oov_prob}) | ||||
|         logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") | ||||
|         logger.info("Added %d lexical entries to the vocab", len(nlp.vocab)) | ||||
|     logger.info("Created vocabulary") | ||||
|     if vectors is not None: | ||||
|         load_vectors_into_model(nlp, vectors) | ||||
|         logger.info(f"Added vectors: {vectors}") | ||||
|         logger.info("Added vectors: %s", vectors) | ||||
|     # warn if source model vectors are not identical | ||||
|     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) | ||||
|     vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) | ||||
|  | @ -191,7 +192,7 @@ def init_tok2vec( | |||
|     if weights_data is not None: | ||||
|         layer = get_tok2vec_ref(nlp, P) | ||||
|         layer.from_bytes(weights_data) | ||||
|         logger.info(f"Loaded pretrained weights from {init_tok2vec}") | ||||
|         logger.info("Loaded pretrained weights from %s", init_tok2vec) | ||||
|         return True | ||||
|     return False | ||||
| 
 | ||||
|  | @ -215,13 +216,13 @@ def convert_vectors( | |||
|         nlp.vocab.deduplicate_vectors() | ||||
|     else: | ||||
|         if vectors_loc: | ||||
|             logger.info(f"Reading vectors from {vectors_loc}") | ||||
|             logger.info("Reading vectors from %s", vectors_loc) | ||||
|             vectors_data, vector_keys, floret_settings = read_vectors( | ||||
|                 vectors_loc, | ||||
|                 truncate, | ||||
|                 mode=mode, | ||||
|             ) | ||||
|             logger.info(f"Loaded vectors from {vectors_loc}") | ||||
|             logger.info("Loaded vectors from %s", vectors_loc) | ||||
|         else: | ||||
|             vectors_data, vector_keys = (None, None) | ||||
|         if vector_keys is not None and mode != VectorsMode.floret: | ||||
|  |  | |||
|  | @ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None: | |||
|             if subdir.exists(): | ||||
|                 try: | ||||
|                     shutil.rmtree(str(subdir)) | ||||
|                     logger.debug(f"Removed existing output directory: {subdir}") | ||||
|                     logger.debug("Removed existing output directory: %s", subdir) | ||||
|                 except Exception as e: | ||||
|                     raise IOError(Errors.E901.format(path=path)) from e | ||||
|  |  | |||
|  | @ -33,6 +33,7 @@ import inspect | |||
| import pkgutil | ||||
| import logging | ||||
| import socket | ||||
| import stat | ||||
| 
 | ||||
| try: | ||||
|     import cupy.random | ||||
|  | @ -139,8 +140,17 @@ class registry(thinc.registry): | |||
|         return func | ||||
| 
 | ||||
|     @classmethod | ||||
|     def find(cls, registry_name: str, func_name: str) -> Callable: | ||||
|         """Get info about a registered function from the registry.""" | ||||
|     def find( | ||||
|         cls, registry_name: str, func_name: str | ||||
|     ) -> Dict[str, Optional[Union[str, int]]]: | ||||
|         """Find information about a registered function, including the | ||||
|         module and path to the file it's defined in, the line number and the | ||||
|         docstring, if available. | ||||
| 
 | ||||
|         registry_name (str): Name of the catalogue registry. | ||||
|         func_name (str): Name of the registered function. | ||||
|         RETURNS (Dict[str, Optional[Union[str, int]]]): The function info. | ||||
|         """ | ||||
|         # We're overwriting this classmethod so we're able to provide more | ||||
|         # specific error messages and implement a fallback to spacy-legacy. | ||||
|         if not hasattr(cls, registry_name): | ||||
|  | @ -1030,8 +1040,15 @@ def make_tempdir() -> Generator[Path, None, None]: | |||
|     """ | ||||
|     d = Path(tempfile.mkdtemp()) | ||||
|     yield d | ||||
| 
 | ||||
|     # On Windows, git clones use read-only files, which cause permission errors | ||||
|     # when being deleted. This forcibly fixes permissions. | ||||
|     def force_remove(rmfunc, path, ex): | ||||
|         os.chmod(path, stat.S_IWRITE) | ||||
|         rmfunc(path) | ||||
| 
 | ||||
|     try: | ||||
|         shutil.rmtree(str(d)) | ||||
|         shutil.rmtree(str(d), onerror=force_remove) | ||||
|     except PermissionError as e: | ||||
|         warnings.warn(Warnings.W091.format(dir=d, msg=e)) | ||||
| 
 | ||||
|  |  | |||
|  | @ -26,7 +26,7 @@ class Vocab: | |||
|     def __init__( | ||||
|         self, | ||||
|         lex_attr_getters: Optional[Dict[str, Callable[[str], Any]]] = ..., | ||||
|         strings: Optional[Union[List[str], StringStore]] = ..., | ||||
|         strings: Optional[StringStore] = ..., | ||||
|         lookups: Optional[Lookups] = ..., | ||||
|         oov_prob: float = ..., | ||||
|         writing_system: Dict[str, Any] = ..., | ||||
|  |  | |||
|  | @ -49,9 +49,8 @@ cdef class Vocab: | |||
| 
 | ||||
|     DOCS: https://spacy.io/api/vocab | ||||
|     """ | ||||
|     def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, | ||||
|                  oov_prob=-20., writing_system={}, get_noun_chunks=None, | ||||
|                  **deprecated_kwargs): | ||||
|     def __init__(self, lex_attr_getters=None, strings=None, lookups=None, | ||||
|             oov_prob=-20., writing_system=None, get_noun_chunks=None): | ||||
|         """Create the vocabulary. | ||||
| 
 | ||||
|         lex_attr_getters (dict): A dictionary mapping attribute IDs to | ||||
|  | @ -69,16 +68,19 @@ cdef class Vocab: | |||
|         self.cfg = {'oov_prob': oov_prob} | ||||
|         self.mem = Pool() | ||||
|         self._by_orth = PreshMap() | ||||
|         self.strings = StringStore() | ||||
|         self.length = 0 | ||||
|         if strings: | ||||
|             for string in strings: | ||||
|                 _ = self[string] | ||||
|         if strings is None: | ||||
|             self.strings = StringStore() | ||||
|         else: | ||||
|             self.strings = strings | ||||
|         self.lex_attr_getters = lex_attr_getters | ||||
|         self.morphology = Morphology(self.strings) | ||||
|         self.vectors = Vectors(strings=self.strings) | ||||
|         self.lookups = lookups | ||||
|         self.writing_system = writing_system | ||||
|         if writing_system is None: | ||||
|             self.writing_system = {} | ||||
|         else: | ||||
|             self.writing_system = writing_system | ||||
|         self.get_noun_chunks = get_noun_chunks | ||||
| 
 | ||||
|     property vectors: | ||||
|  |  | |||
|  | @ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a | |||
| | `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                                    | | ||||
| 
 | ||||
| ### spacy.EmptyKB.v1 {id="EmptyKB"} | ||||
| ### spacy.EmptyKB.v1 {id="EmptyKB.v1"} | ||||
| 
 | ||||
| A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) | ||||
| instance. This is the default when a new entity linker component is created. | ||||
| instance. | ||||
| 
 | ||||
| | Name                   | Description                                                                         | | ||||
| | ---------------------- | ----------------------------------------------------------------------------------- | | ||||
| | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | | ||||
| 
 | ||||
| ### spacy.EmptyKB.v2 {id="EmptyKB"} | ||||
| 
 | ||||
| A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) | ||||
| instance. This is the default when a new entity linker component is created. It | ||||
| returns a `Callable[[Vocab, int], InMemoryLookupKB]`. | ||||
| 
 | ||||
| ### spacy.KBFromFile.v1 {id="KBFromFile"} | ||||
| 
 | ||||
| A function that reads an existing `KnowledgeBase` from file. | ||||
|  | @ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default | |||
| `CandidateGenerator` uses the text of a mention to find its potential aliases in | ||||
| the `KnowledgeBase`. Note that this function is case-dependent. | ||||
| 
 | ||||
| ### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"} | ||||
| 
 | ||||
| A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of | ||||
| [`Span`](/api/span) objects denoting named entities, and returns a list of | ||||
| plausible [`Candidate`](/api/kb/#candidate) objects per specified | ||||
| [`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a | ||||
| mention to find its potential aliases in the `KnowledgeBase`. Note that this | ||||
| function is case-dependent. | ||||
| 
 | ||||
| ## Coreference {id="coref-architectures",tag="experimental"} | ||||
| 
 | ||||
| A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to | ||||
|  |  | |||
|  | @ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir] | |||
| ### project pull {id="project-pull",tag="command"} | ||||
| 
 | ||||
| Download all files or directories listed as `outputs` for commands, unless they | ||||
| are not already present locally. When searching for files in the remote, `pull` | ||||
| are already present locally. When searching for files in the remote, `pull` | ||||
| won't just look at the output path, but will also consider the **command | ||||
| string** and the **hashes of the dependencies**. For instance, let's say you've | ||||
| previously pushed a checkpoint to the remote, but now you've changed some | ||||
|  |  | |||
|  | @ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which | |||
| come directly from | ||||
| [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): | ||||
| 
 | ||||
| | Symbol    | Description                                                                                                          | | ||||
| | --------- | -------------------------------------------------------------------------------------------------------------------- | | ||||
| | `A < B`   | `A` is the immediate dependent of `B`.                                                                               | | ||||
| | `A > B`   | `A` is the immediate head of `B`.                                                                                    | | ||||
| | `A << B`  | `A` is the dependent in a chain to `B` following dep → head paths.                                              | | ||||
| | `A >> B`  | `A` is the head in a chain to `B` following head → dep paths.                                                   | | ||||
| | `A . B`   | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree.                   | | ||||
| | `A .* B`  | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_.                 | | ||||
| | `A ; B`   | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | | ||||
| | `A ;* B`  | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_.                  | | ||||
| | `A $+ B`  | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`.                 | | ||||
| | `A $- B`  | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`.                  | | ||||
| | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`.                                | | ||||
| | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`.                                 | | ||||
| | `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | ||||
| | `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | ||||
| | `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | ||||
| | `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | ||||
| | Symbol                                  | Description                                                                                                          | | ||||
| | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | | ||||
| | `A < B`                                 | `A` is the immediate dependent of `B`.                                                                               | | ||||
| | `A > B`                                 | `A` is the immediate head of `B`.                                                                                    | | ||||
| | `A << B`                                | `A` is the dependent in a chain to `B` following dep → head paths.                                              | | ||||
| | `A >> B`                                | `A` is the head in a chain to `B` following head → dep paths.                                                   | | ||||
| | `A . B`                                 | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree.                   | | ||||
| | `A .* B`                                | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_.                 | | ||||
| | `A ; B`                                 | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | | ||||
| | `A ;* B`                                | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_.                  | | ||||
| | `A $+ B`                                | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`.                 | | ||||
| | `A $- B`                                | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`.                  | | ||||
| | `A $++ B`                               | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`.                                | | ||||
| | `A $-- B`                               | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`.                                 | | ||||
| | `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_.          | | ||||
| | `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_.           | | ||||
| | `A >++ B`                               | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | ||||
| | `A >-- B`                               | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | ||||
| | `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_.          | | ||||
| | `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_.           | | ||||
| | `A <++ B`                               | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | ||||
| | `A <-- B`                               | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | ||||
| 
 | ||||
| ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -214,6 +214,7 @@ alignment mode `"strict". | |||
| | `start`                                  | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        | | ||||
| | `end`                                    | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      | | ||||
| | `label`                                  | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  | | ||||
| | _keyword-only_                           |                                                                                                                                                                                                                                                                              | | ||||
| | `kb_id`                                  | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    | | ||||
| | `vector`                                 | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               | | ||||
| | `alignment_mode`                         | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | ||||
|  | @ -653,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer). | |||
| 
 | ||||
| ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"} | ||||
| 
 | ||||
| Iterate over the base noun phrases in the document. Yields base noun-phrase | ||||
| `Span` objects, if the document has been syntactically parsed. A base noun | ||||
| phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be | ||||
| nested within it – so no NP-level coordination, no prepositional phrases, and no | ||||
| relative clauses. | ||||
| Returns a tuple of the base noun phrases in the doc, if the document has been | ||||
| syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that | ||||
| does not permit other NPs to be nested within it – so no NP-level coordination, | ||||
| no prepositional phrases, and no relative clauses. | ||||
| 
 | ||||
| To customize the noun chunk iterator in a loaded pipeline, modify | ||||
| [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` | ||||
|  | @ -674,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised. | |||
| > assert chunks[1].text == "another phrase" | ||||
| > ``` | ||||
| 
 | ||||
| | Name       | Description                           | | ||||
| | ---------- | ------------------------------------- | | ||||
| | **YIELDS** | Noun chunks in the document. ~~Span~~ | | ||||
| | Name        | Description                                  | | ||||
| | ----------- | -------------------------------------------- | | ||||
| | **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ | | ||||
| 
 | ||||
| ## Doc.sents {id="sents",tag="property",model="sentences"} | ||||
| 
 | ||||
| Iterate over the sentences in the document. Sentence spans have no label. | ||||
| Returns a tuple of the sentences in the document. Sentence spans have no label. | ||||
| 
 | ||||
| This property is only available when | ||||
| [sentence boundaries](/usage/linguistic-features#sbd) have been set on the | ||||
|  | @ -696,9 +696,9 @@ will raise an error otherwise. | |||
| > assert [s.root.text for s in sents] == ["is", "'s"] | ||||
| > ``` | ||||
| 
 | ||||
| | Name       | Description                         | | ||||
| | ---------- | ----------------------------------- | | ||||
| | **YIELDS** | Sentences in the document. ~~Span~~ | | ||||
| | Name        | Description                                | | ||||
| | ----------- | ------------------------------------------ | | ||||
| | **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ | | ||||
| 
 | ||||
| ## Doc.has_vector {id="has_vector",tag="property",model="vectors"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -53,20 +53,22 @@ architectures and their arguments and hyperparameters. | |||
| > nlp.add_pipe("entity_linker", config=config) | ||||
| > ``` | ||||
| 
 | ||||
| | Setting                                         | Description                                                                                                                                                                                                                                                                                 | | ||||
| | ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `labels_discard`                                | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                              | | ||||
| | `n_sents`                                       | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                           | | ||||
| | `incl_prior`                                    | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                        | | ||||
| | `incl_context`                                  | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                      | | ||||
| | `model`                                         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                      | | ||||
| | `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               | | ||||
| | `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        | | ||||
| | `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    | | ||||
| | `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                   | | ||||
| | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     | | ||||
| | Setting                                             | Description                                                                                                                                                                                                                                                                                                      | | ||||
| | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `labels_discard`                                    | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                                   | | ||||
| | `n_sents`                                           | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                                                | | ||||
| | `incl_prior`                                        | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                             | | ||||
| | `incl_context`                                      | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                           | | ||||
| | `model`                                             | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                                           | | ||||
| | `entity_vector_length`                              | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                                                    | | ||||
| | `use_gold_ents`                                     | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~                                                                                                                            | | ||||
| | `get_candidates`                                    | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                         | | ||||
| | `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | | ||||
| | `generate_empty_kb` <Tag variant="new">3.6</Tag>    | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~                                                                           | | ||||
| | `overwrite` <Tag variant="new">3.2</Tag>            | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                                         | | ||||
| | `scorer` <Tag variant="new">3.2</Tag>               | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                                          | | ||||
| | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        | | ||||
| | `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | | ||||
| | `threshold` <Tag variant="new">3.4</Tag>            | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~                     | | ||||
| 
 | ||||
| ```python | ||||
| %%GITHUB_SPACY/spacy/pipeline/entity_linker.py | ||||
|  |  | |||
|  | @ -10,9 +10,9 @@ version: 3.5 | |||
| 
 | ||||
| The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and | ||||
| implements all of its methods. It stores all KB data in-memory and generates | ||||
| [`Candidate`](/api/kb#candidate) objects by exactly matching mentions with | ||||
| entity names. It's highly optimized for both a low memory footprint and speed of | ||||
| retrieval. | ||||
| [`InMemoryCandidate`](/api/kb#candidate) objects by exactly matching mentions | ||||
| with entity names. It's highly optimized for both a low memory footprint and | ||||
| speed of retrieval. | ||||
| 
 | ||||
| ## InMemoryLookupKB.\_\_init\_\_ {id="init",tag="method"} | ||||
| 
 | ||||
|  | @ -156,7 +156,7 @@ Get a list of all aliases in the knowledge base. | |||
| ## InMemoryLookupKB.get_candidates {id="get_candidates",tag="method"} | ||||
| 
 | ||||
| Given a certain textual mention as input, retrieve a list of candidate entities | ||||
| of type [`Candidate`](/api/kb#candidate). Wraps | ||||
| of type [`InMemoryCandidate`](/api/kb#candidate). Wraps | ||||
| [`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). | ||||
| 
 | ||||
| > #### Example | ||||
|  | @ -168,10 +168,10 @@ of type [`Candidate`](/api/kb#candidate). Wraps | |||
| > candidates = kb.get_candidates(doc[0:2]) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                                                          | | ||||
| | ----------- | -------------------------------------------------------------------- | | ||||
| | `mention`   | The textual mention or alias. ~~Span~~                               | | ||||
| | **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ | | ||||
| | Name        | Description                                                                          | | ||||
| | ----------- | ------------------------------------------------------------------------------------ | | ||||
| | `mention`   | The textual mention or alias. ~~Span~~                                               | | ||||
| | **RETURNS** | An iterable of relevant `InMemoryCandidate` objects. ~~Iterable[InMemoryCandidate]~~ | | ||||
| 
 | ||||
| ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"} | ||||
| 
 | ||||
|  | @ -189,31 +189,16 @@ to you. | |||
| > | ||||
| > ```python | ||||
| > from spacy.lang.en import English | ||||
| > from spacy.tokens import SpanGroup | ||||
| > nlp = English() | ||||
| > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.") | ||||
| > candidates = kb.get_candidates((doc[0:2], doc[3:])) | ||||
| > candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]])) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                                                                                  | | ||||
| | ----------- | -------------------------------------------------------------------------------------------- | | ||||
| | `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             | | ||||
| | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ | | ||||
| 
 | ||||
| ## InMemoryLookupKB.get_alias_candidates {id="get_alias_candidates",tag="method"} | ||||
| 
 | ||||
| Given a certain textual mention as input, retrieve a list of candidate entities | ||||
| of type [`Candidate`](/api/kb#candidate). | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > candidates = kb.get_alias_candidates("Douglas") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                                                   | | ||||
| | ----------- | ------------------------------------------------------------- | | ||||
| | `alias`     | The textual mention or alias. ~~str~~                         | | ||||
| | **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ | | ||||
| | Name        | Description                                                                                                  | | ||||
| | ----------- | ------------------------------------------------------------------------------------------------------------ | | ||||
| | `mentions`  | The textual mentions. ~~SpanGroup~~                                                                          | | ||||
| | **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ | | ||||
| 
 | ||||
| ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -93,33 +93,17 @@ to you. | |||
| > | ||||
| > ```python | ||||
| > from spacy.lang.en import English | ||||
| > from spacy.tokens import SpanGroup | ||||
| > nlp = English() | ||||
| > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.") | ||||
| > candidates = kb.get_candidates((doc[0:2], doc[3:])) | ||||
| > candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]])) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                                                                                  | | ||||
| | ----------- | -------------------------------------------------------------------------------------------- | | ||||
| | `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             | | ||||
| | `mentions`  | The textual mentions. ~~SpanGroup~~                                                          | | ||||
| | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ | | ||||
| 
 | ||||
| ## KnowledgeBase.get_alias_candidates {id="get_alias_candidates",tag="method"} | ||||
| 
 | ||||
| <Infobox variant="warning"> | ||||
|   This method is _not_ available from spaCy 3.5 onwards. | ||||
| </Infobox> | ||||
| 
 | ||||
| From spaCy 3.5 on `KnowledgeBase` is an abstract class (with | ||||
| [`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to | ||||
| allow more flexibility in customizing knowledge bases. Some of its methods were | ||||
| moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring, | ||||
| one of those being `get_alias_candidates()`. This method is now available as | ||||
| [`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). | ||||
| Note: | ||||
| [`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates) | ||||
| defaults to | ||||
| [`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). | ||||
| 
 | ||||
| ## KnowledgeBase.get_vector {id="get_vector",tag="method"} | ||||
| 
 | ||||
| Given a certain entity ID, retrieve its pretrained entity vector. | ||||
|  | @ -190,25 +174,25 @@ Restore the state of the knowledge base from a given directory. Note that the | |||
| | `exclude`   | List of components to exclude. ~~Iterable[str]~~                                                | | ||||
| | **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~                                          | | ||||
| 
 | ||||
| ## Candidate {id="candidate",tag="class"} | ||||
| ## InMemoryCandidate {id="candidate",tag="class"} | ||||
| 
 | ||||
| A `Candidate` object refers to a textual mention (alias) that may or may not be | ||||
| resolved to a specific entity from a `KnowledgeBase`. This will be used as input | ||||
| for the entity linking algorithm which will disambiguate the various candidates | ||||
| to the correct one. Each candidate `(alias, entity)` pair is assigned to a | ||||
| certain prior probability. | ||||
| An `InMemoryCandidate` object refers to a textual mention (alias) that may or | ||||
| may not be resolved to a specific entity from a `KnowledgeBase`. This will be | ||||
| used as input for the entity linking algorithm which will disambiguate the | ||||
| various candidates to the correct one. Each candidate `(alias, entity)` pair is | ||||
| assigned a certain prior probability. | ||||
| 
 | ||||
| ### Candidate.\_\_init\_\_ {id="candidate-init",tag="method"} | ||||
| ### InMemoryCandidate.\_\_init\_\_ {id="candidate-init",tag="method"} | ||||
| 
 | ||||
| Construct a `Candidate` object. Usually this constructor is not called directly, | ||||
| but instead these objects are returned by the `get_candidates` method of the | ||||
| [`entity_linker`](/api/entitylinker) pipe. | ||||
| Construct an `InMemoryCandidate` object. Usually this constructor is not called | ||||
| directly, but instead these objects are returned by the `get_candidates` method | ||||
| of the [`entity_linker`](/api/entitylinker) pipe. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.kb import Candidate | ||||
| > candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob) | ||||
| > from spacy.kb import InMemoryCandidate | ||||
| > candidate = InMemoryCandidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob) | ||||
| > ``` | ||||
| 
 | ||||
| | Name          | Description                                                               | | ||||
|  | @ -216,10 +200,10 @@ but instead these objects are returned by the `get_candidates` method of the | |||
| | `kb`          | The knowledge base that defined this candidate. ~~KnowledgeBase~~         | | ||||
| | `entity_hash` | The hash of the entity's KB ID. ~~int~~                                   | | ||||
| | `entity_freq` | The entity frequency as recorded in the KB. ~~float~~                     | | ||||
| | `alias_hash`  | The hash of the textual mention or alias. ~~int~~                         | | ||||
| | `alias_hash`  | The hash of the entity alias. ~~int~~                                     | | ||||
| | `prior_prob`  | The prior probability of the `alias` referring to the `entity`. ~~float~~ | | ||||
| 
 | ||||
| ## Candidate attributes {id="candidate-attributes"} | ||||
| ## InMemoryCandidate attributes {id="candidate-attributes"} | ||||
| 
 | ||||
| | Name            | Description                                                              | | ||||
| | --------------- | ------------------------------------------------------------------------ | | ||||
|  |  | |||
|  | @ -188,9 +188,10 @@ the character indices don't map to a valid span. | |||
| 
 | ||||
| | Name                                            | Description                                                                                                                                                                                                                                                                  | | ||||
| | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `start`                                         | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        | | ||||
| | `end`                                           | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      | | ||||
| | `start_idx`                                     | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        | | ||||
| | `end_idx`                                       | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      | | ||||
| | `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  | | ||||
| | _keyword-only_                                  |                                                                                                                                                                                                                                                                              | | ||||
| | `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    | | ||||
| | `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               | | ||||
| | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | ||||
|  | @ -274,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of | |||
| > assert ents[0].text == "Mr. Best" | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                                                       | | ||||
| | ----------- | ----------------------------------------------------------------- | | ||||
| | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | | ||||
| | Name        | Description                                                  | | ||||
| | ----------- | ------------------------------------------------------------ | | ||||
| | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ | | ||||
| 
 | ||||
| ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"} | ||||
| 
 | ||||
| Iterate over the base noun phrases in the span. Yields base noun-phrase `Span` | ||||
| objects, if the document has been syntactically parsed. A base noun phrase, or | ||||
| "NP chunk", is a noun phrase that does not permit other NPs to be nested within | ||||
| it – so no NP-level coordination, no prepositional phrases, and no relative | ||||
| clauses. | ||||
| Returns a tuple of the base noun phrases in the span if the document has been | ||||
| syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that | ||||
| does not permit other NPs to be nested within it – so no NP-level coordination, | ||||
| no prepositional phrases, and no relative clauses. | ||||
| 
 | ||||
| If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) | ||||
| has not been implemented for the given language, a `NotImplementedError` is | ||||
|  | @ -300,9 +300,9 @@ raised. | |||
| > assert chunks[0].text == "another phrase" | ||||
| > ``` | ||||
| 
 | ||||
| | Name       | Description                       | | ||||
| | ---------- | --------------------------------- | | ||||
| | **YIELDS** | Noun chunks in the span. ~~Span~~ | | ||||
| | Name        | Description                              | | ||||
| | ----------- | ---------------------------------------- | | ||||
| | **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ | | ||||
| 
 | ||||
| ## Span.as_doc {id="as_doc",tag="method"} | ||||
| 
 | ||||
|  | @ -524,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)] | |||
| 
 | ||||
| ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"} | ||||
| 
 | ||||
| Returns a generator over the sentences the span belongs to. This property is | ||||
| only available when [sentence boundaries](/usage/linguistic-features#sbd) have | ||||
| been set on the document by the `parser`, `senter`, `sentencizer` or some custom | ||||
| Returns a tuple of the sentences the span belongs to. This property is only | ||||
| available when [sentence boundaries](/usage/linguistic-features#sbd) have been | ||||
| set on the document by the `parser`, `senter`, `sentencizer` or some custom | ||||
| function. It will raise an error otherwise. | ||||
| 
 | ||||
| If the span happens to cross sentence boundaries, all sentences the span | ||||
|  | @ -540,9 +540,9 @@ overlaps with will be returned. | |||
| > assert len(span.sents) == 2 | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                                                                | | ||||
| | ----------- | -------------------------------------------------------------------------- | | ||||
| | **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ | | ||||
| | Name        | Description                                                   | | ||||
| | ----------- | ------------------------------------------------------------- | | ||||
| | **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ | | ||||
| 
 | ||||
| ## Attributes {id="attributes"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of | |||
| integer IDs. This ensures that strings always map to the same ID, even from | ||||
| different `StringStores`. | ||||
| 
 | ||||
| <Infobox variant="warning"> | ||||
| 
 | ||||
| Note that a `StringStore` instance is not static. It increases in size as texts | ||||
| with new tokens are processed. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ## StringStore.\_\_init\_\_ {id="init",tag="method"} | ||||
| 
 | ||||
| Create the `StringStore`. | ||||
|  |  | |||
|  | @ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both | |||
| | `doc`       | The document to process. ~~Doc~~ | | ||||
| | **RETURNS** | The processed document. ~~Doc~~  | | ||||
| 
 | ||||
| ## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"} | ||||
| 
 | ||||
| Performs an update of the student pipe's model using the student's distillation  | ||||
| examples and sets the annotations of the teacher's distillation examples using  | ||||
| the teacher pipe.  | ||||
| 
 | ||||
| Unlike other trainable pipes, the student pipe doesn't directly learn its  | ||||
| representations from the teacher. However, since downstream pipes that do  | ||||
| perform distillation expect the tok2vec annotations to be present on the  | ||||
| correct distillation examples, we need to ensure that they are set beforehand. | ||||
| 
 | ||||
| The distillation is performed on ~~Example~~ objects. The `Example.reference` | ||||
| and `Example.predicted` ~~Doc~~s must have the same number of tokens and the | ||||
| same orthography. Even though the reference does not need to have gold | ||||
| annotations, the teacher could add its own annotations when necessary. | ||||
| 
 | ||||
| This feature is experimental. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > teacher_pipe = teacher.add_pipe("tok2vec") | ||||
| > student_pipe = student.add_pipe("tok2vec") | ||||
| > optimizer = nlp.resume_training() | ||||
| > losses = student_pipe.distill(teacher_pipe, examples, sgd=optimizer) | ||||
| > ``` | ||||
| 
 | ||||
| | Name           | Description                                                                                                                                 | | ||||
| | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~                                                                                 | | ||||
| | `examples`     | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ | | ||||
| | _keyword-only_ |                                                                                                                                             | | ||||
| | `drop`         | Dropout rate. ~~float~~                                                                                                                     | | ||||
| | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                               | | ||||
| | `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                | | ||||
| | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                       | | ||||
| 
 | ||||
| ## Tok2Vec.pipe {id="pipe",tag="method"} | ||||
| 
 | ||||
| Apply the pipe to a stream of documents. This usually happens under the hood | ||||
|  |  | |||
|  | @ -355,22 +355,22 @@ If a setting is not present in the options, the default value will be used. | |||
| > displacy.serve(doc, style="dep", options=options) | ||||
| > ``` | ||||
| 
 | ||||
| | Name               | Description                                                                                                                                  | | ||||
| | ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `fine_grained`     | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~             | | ||||
| | `add_lemma`        | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~                                                      | | ||||
| | `collapse_punct`   | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | | ||||
| | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~                                                                             | | ||||
| | `compact`          | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~                                                    | | ||||
| | `color`            | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~                                                                       | | ||||
| | `bg`               | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~                                                                 | | ||||
| | `font`             | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~                                                                        | | ||||
| | `offset_x`         | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~                                                                             | | ||||
| | `arrow_stroke`     | Width of arrow path in px. Defaults to `2`. ~~int~~                                                                                          | | ||||
| | `arrow_width`      | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~                                                 | | ||||
| | `arrow_spacing`    | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~                           | | ||||
| | `word_spacing`     | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~                                                                     | | ||||
| | `distance`         | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~                                           | | ||||
| | Name               | Description                                                                                                                                                                                                                                   | | ||||
| | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `fine_grained`     | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~                                                                                                              | | ||||
| | `add_lemma`        | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~                                                                                                                                                       | | ||||
| | `collapse_punct`   | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~                                                                                                  | | ||||
| | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~                                                                                                                                                                              | | ||||
| | `compact`          | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~                                                                                                                                                     | | ||||
| | `color`            | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~       | | ||||
| | `bg`               | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | | ||||
| | `font`             | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~                                                                                                                                                                         | | ||||
| | `offset_x`         | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~                                                                                                                                                                              | | ||||
| | `arrow_stroke`     | Width of arrow path in px. Defaults to `2`. ~~int~~                                                                                                                                                                                           | | ||||
| | `arrow_width`      | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~                                                                                                                                                  | | ||||
| | `arrow_spacing`    | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~                                                                                                                            | | ||||
| | `word_spacing`     | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~                                                                                                                                                                      | | ||||
| | `distance`         | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~                                                                                                                                            | | ||||
| 
 | ||||
| #### Named Entity Visualizer options {id="displacy_options-ent"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access | |||
| [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared | ||||
| between `Doc` objects. | ||||
| 
 | ||||
| <Infobox variant="warning"> | ||||
| 
 | ||||
| Note that a `Vocab` instance is not static. It increases in size as texts with | ||||
| new tokens are processed. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ## Vocab.\_\_init\_\_ {id="init",tag="method"} | ||||
| 
 | ||||
| Create the vocabulary. | ||||
|  | @ -17,14 +24,15 @@ Create the vocabulary. | |||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.strings import StringStore | ||||
| > from spacy.vocab import Vocab | ||||
| > vocab = Vocab(strings=["hello", "world"]) | ||||
| > vocab = Vocab(strings=StringStore(["hello", "world"])) | ||||
| > ``` | ||||
| 
 | ||||
| | Name               | Description                                                                                                                                                             | | ||||
| | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~                                      | | ||||
| | `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~                           | | ||||
| | `strings`          | A [`StringStore`](/api/stringstore) that maps strings to hash values. ~~Optional[StringStore]~~                                                                         | | ||||
| | `lookups`          | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~                                      | | ||||
| | `oov_prob`         | The default OOV probability. Defaults to `-20.0`. ~~float~~                                                                                                             | | ||||
| | `writing_system`   | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~                          | | ||||
|  |  | |||
|  | @ -22,17 +22,20 @@ array([2.02280000e-01,  -7.66180009e-02,   3.70319992e-01, | |||
| <Infobox title="Important note" variant="warning"> | ||||
| 
 | ||||
| To make them compact and fast, spaCy's small [pipeline packages](/models) (all | ||||
| packages that end in `sm`) **don't ship with word vectors**, and only include | ||||
| context-sensitive **tensors**. This means you can still use the `similarity()` | ||||
| methods to compare documents, spans and tokens – but the result won't be as | ||||
| good, and individual tokens won't have any vectors assigned. So in order to use | ||||
| _real_ word vectors, you need to download a larger pipeline package: | ||||
| packages that end in `sm`) **don't ship with word vectors**. In order to use | ||||
| `similarity()`, you need to download a larger pipeline package that includes | ||||
| vectors: | ||||
| 
 | ||||
| ```diff | ||||
| - python -m spacy download en_core_web_sm | ||||
| + python -m spacy download en_core_web_lg | ||||
| + python -m spacy download en_core_web_md | ||||
| ``` | ||||
| 
 | ||||
| In spaCy v3 and earlier, small pipeline packages supported `similarity()` by | ||||
| backing off to context-sensitive tensors from the `tok2vec` component. These | ||||
| tensors do not work well for this purpose and this backoff has been removed in | ||||
| spaCy v4. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| Pipeline packages that come with built-in word vectors make them available as | ||||
|  |  | |||
|  | @ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which | |||
| come directly from | ||||
| [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): | ||||
| 
 | ||||
| | Symbol    | Description                                                                                                          | | ||||
| | --------- | -------------------------------------------------------------------------------------------------------------------- | | ||||
| | `A < B`   | `A` is the immediate dependent of `B`.                                                                               | | ||||
| | `A > B`   | `A` is the immediate head of `B`.                                                                                    | | ||||
| | `A << B`  | `A` is the dependent in a chain to `B` following dep → head paths.                                              | | ||||
| | `A >> B`  | `A` is the head in a chain to `B` following head → dep paths.                                                   | | ||||
| | `A . B`   | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree.                   | | ||||
| | `A .* B`  | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_.                 | | ||||
| | `A ; B`   | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | | ||||
| | `A ;* B`  | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_.                  | | ||||
| | `A $+ B`  | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`.                 | | ||||
| | `A $- B`  | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`.                  | | ||||
| | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`.                                | | ||||
| | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`.                                 | | ||||
| | Symbol                                  | Description                                                                                                          | | ||||
| | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | | ||||
| | `A < B`                                 | `A` is the immediate dependent of `B`.                                                                               | | ||||
| | `A > B`                                 | `A` is the immediate head of `B`.                                                                                    | | ||||
| | `A << B`                                | `A` is the dependent in a chain to `B` following dep → head paths.                                              | | ||||
| | `A >> B`                                | `A` is the head in a chain to `B` following head → dep paths.                                                   | | ||||
| | `A . B`                                 | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree.                   | | ||||
| | `A .* B`                                | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_.                 | | ||||
| | `A ; B`                                 | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | | ||||
| | `A ;* B`                                | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_.                  | | ||||
| | `A $+ B`                                | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`.                 | | ||||
| | `A $- B`                                | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`.                  | | ||||
| | `A $++ B`                               | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`.                                | | ||||
| | `A $-- B`                               | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`.                                 | | ||||
| | `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_.          | | ||||
| | `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_.           | | ||||
| | `A >++ B`                               | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | ||||
| | `A >-- B`                               | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | ||||
| | `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_.          | | ||||
| | `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_.           | | ||||
| | `A <++ B`                               | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | ||||
| | `A <-- B`                               | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | ||||
| 
 | ||||
| ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} | ||||
| 
 | ||||
|  | @ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline") | |||
| 
 | ||||
| The saved pipeline now includes the `"entity_ruler"` in its | ||||
| [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a | ||||
| file `entityruler.jsonl` with the patterns. When you load the pipeline back in, | ||||
| all pipeline components will be restored and deserialized – including the entity | ||||
| file `patterns.jsonl` with the patterns. When you load the pipeline back in, all | ||||
| pipeline components will be restored and deserialized – including the entity | ||||
| ruler. This lets you ship powerful pipeline packages with binary weights _and_ | ||||
| rules included! | ||||
| 
 | ||||
|  |  | |||
|  | @ -58,12 +58,12 @@ arcs. | |||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| | Argument  | Description                                                                               | | ||||
| | --------- | ----------------------------------------------------------------------------------------- | | ||||
| | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | | ||||
| | `color`   | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~                    | | ||||
| | `bg`      | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~              | | ||||
| | `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~                     | | ||||
| | Argument  | Description                                                                                                                                                                                                                                   | | ||||
| | --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~                                                                                                                                                     | | ||||
| | `color`   | Text color. Can be provided as a string in any valid CSS format, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~       | | ||||
| | `bg`      | Background color. Can be provided as a string in any valid CSS format, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | | ||||
| | `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~                                                                                                                                                                         | | ||||
| 
 | ||||
| For a list of all available options, see the | ||||
| [`displacy` API documentation](/api/top-level#displacy_options). | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user