Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-31 07:57:35 +03:00
Merge branch 'master' into sync/master-into-v4

# Conflicts:
#	requirements.txt
#	spacy/pipeline/entity_linker.py
#	spacy/util.py
#	website/docs/api/entitylinker.mdx

This commit is contained in commit 1ea31552be.
.github/workflows/autoblack.yml (vendored): 2 changes
@@ -16,7 +16,7 @@ jobs:
         with:
             ref: ${{ github.head_ref }}
       - uses: actions/setup-python@v4
-      - run: pip install black
+      - run: pip install black -c requirements.txt
       - name: Auto-format code if needed
         run: black spacy
       # We can't run black --check here because that returns a non-zero excit
@@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
 Python modules. If you've built spaCy from source, you'll already have both
 tools installed.
 
+As a general rule of thumb, we use f-strings for any formatting of strings.
+One exception are calls to Python's `logging` functionality.
+To avoid unnecessary string conversions in these cases, we use string formatting
+templates with `%s` and `%d` etc.
+
 **⚠️ Note that formatting and linting is currently only possible for Python
 modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
 
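The convention above in practice: a minimal sketch (the logger name and values are made up for illustration):

```python
import logging

logger = logging.getLogger("spacy")
n_docs = 1000

# Ordinary strings: f-strings, evaluated eagerly.
message = f"Processed {n_docs} docs"

# Logging calls: %-style templates, so the final string is only built if the
# record is actually emitted at the active log level.
logger.debug("Processed %d docs", n_docs)
```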
@@ -41,7 +41,7 @@ jobs:
         inputs:
           versionSpec: "3.8"
       - script: |
-          pip install black==22.3.0
+          pip install black -c requirements.txt
           python -m black spacy --check
         displayName: "black"
       - script: |
@@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<0.1000; platform_machine != "aarch64"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
+types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
-black>=22.0,<23.0
+black==22.3.0
@@ -90,9 +90,9 @@ def parse_config_overrides(
     cli_overrides = _parse_overrides(args, is_cli=True)
     if cli_overrides:
         keys = [k for k in cli_overrides if k not in env_overrides]
-        logger.debug(f"Config overrides from CLI: {keys}")
+        logger.debug("Config overrides from CLI: %s", keys)
     if env_overrides:
-        logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+        logger.debug("Config overrides from env variables: %s", list(env_overrides))
     return {**cli_overrides, **env_overrides}
 
 
@@ -252,7 +252,7 @@ def get_third_party_dependencies(
                     raise regerr from None
             module_name = func_info.get("module")  # type: ignore[attr-defined]
             if module_name:  # the code is part of a module, not a --code file
-                modules.add(func_info["module"].split(".")[0])  # type: ignore[index]
+                modules.add(func_info["module"].split(".")[0])  # type: ignore[union-attr]
     dependencies = []
     for module_name in modules:
         if module_name in distributions:
@@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
     # in the list.
     while commands:
         for i, cmd in enumerate(list(commands)):
-            logger.debug(f"CMD: {cmd['name']}.")
+            logger.debug("CMD: %s.", cmd["name"])
             deps = [project_dir / dep for dep in cmd.get("deps", [])]
             if all(dep.exists() for dep in deps):
                 cmd_hash = get_command_hash("", "", deps, cmd["script"])
                 for output_path in cmd.get("outputs", []):
                     url = storage.pull(output_path, command_hash=cmd_hash)
                     logger.debug(
-                        f"URL: {url} for {output_path} with command hash {cmd_hash}"
+                        "URL: %s for %s with command hash %s",
+                        url,
+                        output_path,
+                        cmd_hash,
                     )
                     yield url, output_path
 
@@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
                 commands.pop(i)
                 break
             else:
-                logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
+                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
         else:
             # If we didn't break the for loop, break the while loop.
             break
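The pull loop above keeps sweeping the command list, fetching outputs for any command whose dependencies are already present, and stops once a full sweep makes no progress; the `while`/`for`/`else` combination implements that. A minimal self-contained sketch of the same pattern (command names and outputs are made up):

```python
# Sketch of the retry loop used by project_pull: sweep the command list,
# handle any command whose dependencies are ready, stop once a full sweep
# completes without progress.
commands = [
    {"name": "train", "deps": ["corpus"], "outputs": ["model"]},
    {"name": "evaluate", "deps": ["model"], "outputs": ["metrics"]},
]
available = {"corpus"}

while commands:
    for i, cmd in enumerate(list(commands)):
        if all(dep in available for dep in cmd["deps"]):
            available.update(cmd["outputs"])  # "pull" this command's outputs
            commands.pop(i)
            break  # restart the sweep: new outputs may unblock other commands
    else:
        # No command was handled in this sweep, so no further progress is possible.
        break

print(available)  # {'corpus', 'model', 'metrics'}
```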
@@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
         remote = config["remotes"][remote]
     storage = RemoteStorage(project_dir, remote)
     for cmd in config.get("commands", []):
-        logger.debug(f"CMD: cmd['name']")
+        logger.debug("CMD: %s", cmd["name"])
         deps = [project_dir / dep for dep in cmd.get("deps", [])]
         if any(not dep.exists() for dep in deps):
-            logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
+            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
             continue
         cmd_hash = get_command_hash(
             "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
         )
-        logger.debug(f"CMD_HASH: {cmd_hash}")
+        logger.debug("CMD_HASH: %s", cmd_hash)
         for output_path in cmd.get("outputs", []):
             output_loc = project_dir / output_path
             if output_loc.exists() and _is_not_empty_dir(output_loc):
@@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
                     content_hash=get_content_hash(output_loc),
                 )
                 logger.debug(
-                    f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
+                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
                 )
                 yield output_path, url
 
@@ -437,8 +437,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E133 = ("The sum of prior probabilities for alias '{alias}' should not "
             "exceed 1, but found {sum}.")
     E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
-    E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
-            "`kb.add_entity` and `kb.add_alias` to add entries.")
+    E139 = ("Knowledge base for component '{name}' is empty.")
     E140 = ("The list of entities, prior probabilities and entity vectors "
             "should be of equal length.")
     E141 = ("Entity vectors should be of length {required} instead of the "
@@ -951,7 +950,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
              "with `displacy.serve(doc, port=port)`")
     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
-             "or use `auto_switch_port=True` to pick an available port automatically.")
+             "or use `auto_select_port=True` to pick an available port automatically.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
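For reference, the corrected name in E1050 matches the actual keyword argument on `displacy.serve`. A quick sketch, assuming the `en_core_web_sm` pipeline is installed:

```python
import spacy
from spacy import displacy

# Assumes the en_core_web_sm pipeline is installed.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Autumn leaves fall in October.")

# If the default port (5000) is already in use, auto_select_port=True makes
# displaCy pick a free port instead of raising E1050.
displacy.serve(doc, style="dep", auto_select_port=True)
```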
@@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         self._alias_index = PreshMap(nr_aliases + 1)
         self._aliases_table = alias_vec(nr_aliases + 1)
 
+    def is_empty(self):
+        return len(self) == 0
+
     def __len__(self):
         return self.get_size_entities()
 
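The new `is_empty()` method gives `validate_kb` (see the entity-linker hunk further down) a cheap emptiness check that KB subclasses can override. A minimal usage sketch, assuming a build that includes this change:

```python
from spacy.kb.kb_in_memory import InMemoryLookupKB
from spacy.vocab import Vocab

kb = InMemoryLookupKB(vocab=Vocab(), entity_vector_length=3)
assert kb.is_empty()

# Adding a single entity makes the KB non-empty.
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
assert not kb.is_empty()
```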
@@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language, BaseDefaults
 from ...pipeline import Lemmatizer
-
-
-# Punctuation stolen from Danish
-from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 
 
 class SwedishDefaults(BaseDefaults):
spacy/lang/sv/punctuation.py (new file): 33 lines
@@ -0,0 +1,33 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_quotes = CONCAT_QUOTES.replace("'", "")
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
+    ]
+)
+
+_suffixes = [
+    suffix
+    for suffix in TOKENIZER_SUFFIXES
+    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
+]
+_suffixes += [r"(?<=[^sSxXzZ])\'"]
+
+
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
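The net effect of the new Swedish rules is that a colon between a letter or digit and a lowercase letter is no longer treated as an infix, so common Swedish abbreviated forms stay single tokens; the test added further down checks exactly this. A quick sketch, assuming a build that includes this file:

```python
import spacy

nlp = spacy.blank("sv")

# Swedish genitive/ordinal/abbreviated forms with a colon stay single tokens.
for text in ["EU:s", "99:e", "c:a", "Maj:t"]:
    assert len(nlp(text)) == 1
```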
@@ -106,7 +106,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
 
 @registry.misc("spacy.LookupsDataLoader.v1")
 def load_lookups_data(lang, tables):
-    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
     lookups = load_lookups(lang=lang, tables=tables)
     return lookups
 
@@ -2072,7 +2072,7 @@ class Language:
         pipe = self.get_pipe(pipe_name)
         pipe_cfg = self._pipe_configs[pipe_name]
         if listeners:
-            util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
+            util.logger.debug("Replacing listeners of component '%s'", pipe_name)
             if len(list(listeners)) != len(pipe_listeners):
                 # The number of listeners defined in the component model doesn't
                 # match the listeners to replace, so we won't be able to update
@@ -82,8 +82,12 @@ cdef class DependencyMatcher:
             "$-": self._imm_left_sib,
             "$++": self._right_sib,
             "$--": self._left_sib,
+            ">+": self._imm_right_child,
+            ">-": self._imm_left_child,
             ">++": self._right_child,
             ">--": self._left_child,
+            "<+": self._imm_right_parent,
+            "<-": self._imm_left_parent,
             "<++": self._right_parent,
             "<--": self._left_parent,
         }
@@ -427,12 +431,34 @@ cdef class DependencyMatcher:
     def _left_sib(self, doc, node):
         return [doc[child.i] for child in doc[node].head.children if child.i < node]
 
+    def _imm_right_child(self, doc, node):
+        for child in doc[node].children:
+            if child.i == node + 1:
+                return [doc[child.i]]
+        return []
+
+    def _imm_left_child(self, doc, node):
+        for child in doc[node].children:
+            if child.i == node - 1:
+                return [doc[child.i]]
+        return []
+
     def _right_child(self, doc, node):
         return [doc[child.i] for child in doc[node].children if child.i > node]
 
     def _left_child(self, doc, node):
         return [doc[child.i] for child in doc[node].children if child.i < node]
 
+    def _imm_right_parent(self, doc, node):
+        if doc[node].head.i == node + 1:
+            return [doc[node].head]
+        return []
+
+    def _imm_left_parent(self, doc, node):
+        if doc[node].head.i == node - 1:
+            return [doc[node].head]
+        return []
+
     def _right_parent(self, doc, node):
         if doc[node].head.i > node:
             return [doc[node].head]
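The four new operators match immediately adjacent children and parents in the dependency tree (`>+`/`>-`: a child directly right/left of its head; `<+`/`<-`: a head directly right/left of the child). A usage sketch, assuming the `en_core_web_sm` pipeline is installed and parses the sentence as expected:

```python
import spacy
from spacy.matcher import DependencyMatcher

# Assumes the en_core_web_sm pipeline is installed.
nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)

# ">+" matches a child token that immediately follows its head: here, an
# adposition directly to the right of the verb that heads it.
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    {
        "LEFT_ID": "verb",
        "REL_OP": ">+",
        "RIGHT_ID": "prep",
        "RIGHT_ATTRS": {"POS": "ADP"},
    },
]
matcher.add("VERB_PREP", [pattern])

doc = nlp("The quick brown fox jumped over the lazy dog.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # expected: ['jumped', 'over']
```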
@@ -829,6 +829,11 @@ def _get_attr_values(spec, string_store):
     return attr_values
 
 
+def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
+    # tuple order affects performance
+    return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
+
+
 # These predicate helper classes are used to match the REGEX, IN, >= etc
 # extensions to the matcher introduced in #3173.
 
@@ -848,7 +853,7 @@ class _FuzzyPredicate:
         fuzz = self.predicate[len("FUZZY"):] # number after prefix
         self.fuzzy = int(fuzz) if fuzz else -1
         self.fuzzy_compare = fuzzy_compare
-        self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
 
     def __call__(self, Token token):
         if self.is_extension:
@@ -870,7 +875,7 @@ class _RegexPredicate:
         self.value = re.compile(value)
         self.predicate = predicate
         self.is_extension = is_extension
-        self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = _predicate_cache_key(self.attr, self.predicate, value)
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
 
@@ -906,7 +911,7 @@ class _SetPredicate:
                 self.value = set(get_string_id(v) for v in value)
         self.predicate = predicate
         self.is_extension = is_extension
-        self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
 
@@ -978,7 +983,7 @@ class _ComparisonPredicate:
         self.value = value
         self.predicate = predicate
         self.is_extension = is_extension
-        self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = _predicate_cache_key(self.attr, self.predicate, value)
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
 
@@ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
         if isinstance(value, dict):
             for type_, cls in predicate_types.items():
                 if type_ in value:
-                    key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
+                    key = _predicate_cache_key(attr, type_, value[type_])
                     if key in seen_predicates:
                         output.append(seen_predicates[key])
                     else:
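The new `_predicate_cache_key` helper only centralizes how the `REGEX`, `IN`, `FUZZY`, and comparison predicates build their dedupe keys; matcher behavior is unchanged. For context, a sketch of the kind of patterns that exercise these predicates (the example text and misspelling are made up):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# FUZZY and REGEX are predicate extensions whose cache keys the new
# _predicate_cache_key helper builds behind the scenes.
matcher.add("FUZZY_DEF", [[{"LOWER": {"FUZZY": "definitely"}}]])
matcher.add("NUMERIC", [[{"TEXT": {"REGEX": r"^\d{4}$"}}]])

doc = nlp("He definatly said 2023.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```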
@@ -89,6 +89,14 @@ def load_kb(
     return kb_from_file
 
 
+@registry.misc("spacy.EmptyKB.v2")
+def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
+    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
+
+    return empty_kb_factory
+
+
 @registry.misc("spacy.EmptyKB.v1")
 def empty_kb(
     entity_vector_length: int,
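Together with the new `generate_empty_kb` setting on the entity linker (next file), this registered factory makes the class used for the initial, empty KB configurable. A hypothetical sketch of a custom factory following the same pattern (the registry name `my.CustomEmptyKB.v1` is made up for illustration):

```python
from typing import Callable

from spacy.kb import KnowledgeBase
from spacy.kb.kb_in_memory import InMemoryLookupKB
from spacy.util import registry
from spacy.vocab import Vocab


# Hypothetical registry name, mirroring the spacy.EmptyKB.v2 factory above.
@registry.misc("my.CustomEmptyKB.v1")
def create_custom_empty_kb() -> Callable[[Vocab, int], KnowledgeBase]:
    def factory(vocab: Vocab, entity_vector_length: int) -> KnowledgeBase:
        # Any KnowledgeBase subclass could be returned here.
        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)

    return factory


# In the pipeline config, the entity linker would then reference it as:
#
# [components.entity_linker.generate_empty_kb]
# @misc = "my.CustomEmptyKB.v1"
```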
@@ -58,6 +58,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
         "overwrite": False,
+        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
         "use_gold_ents": True,
         "candidates_batch_size": 1,
@@ -84,6 +85,7 @@ def make_entity_linker(
     get_candidates_batch: Callable[
         [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
     ],
+    generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
     scorer: Optional[Callable],
     use_gold_ents: bool,
@@ -106,6 +108,7 @@ def make_entity_linker(
     get_candidates_batch (
         Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
+    generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
     use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
         component must provide entity annotations.
@@ -147,6 +150,7 @@ def make_entity_linker(
         entity_vector_length=entity_vector_length,
         get_candidates=get_candidates,
         get_candidates_batch=get_candidates_batch,
+        generate_empty_kb=generate_empty_kb,
         overwrite=overwrite,
         scorer=scorer,
         use_gold_ents=use_gold_ents,
@@ -188,6 +192,7 @@ class EntityLinker(TrainablePipe):
         get_candidates_batch: Callable[
             [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
         ],
+        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
         overwrite: bool = False,
         scorer: Optional[Callable] = entity_linker_score,
         use_gold_ents: bool,
@@ -212,6 +217,7 @@ class EntityLinker(TrainablePipe):
             Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
+        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
         overwrite (bool): Whether to overwrite existing non-empty annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
         use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
@@ -219,6 +225,7 @@ class EntityLinker(TrainablePipe):
         candidates_batch_size (int): Size of batches for entity candidate generation.
         threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
             threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
+        save_activations (bool): save model activations in Doc when annotating.
         DOCS: https://spacy.io/api/entitylinker#init
         """
 
@@ -235,6 +242,7 @@ class EntityLinker(TrainablePipe):
         self.model = model
         self.name = name
         self.labels_discard = list(labels_discard)
+        # how many neighbour sentences to take into account
         self.n_sents = n_sents
         self.incl_prior = incl_prior
         self.incl_context = incl_context
@@ -242,9 +250,7 @@ class EntityLinker(TrainablePipe):
         self.get_candidates_batch = get_candidates_batch
         self.cfg: Dict[str, Any] = {"overwrite": overwrite}
         self.distance = CosineDistance(normalize=False)
-        # how many neighbour sentences to take into account
-        # create an empty KB by default
-        self.kb = empty_kb(entity_vector_length)(self.vocab)
+        self.kb = generate_empty_kb(self.vocab, entity_vector_length)
         self.scorer = scorer
         self.use_gold_ents = use_gold_ents
         self.candidates_batch_size = candidates_batch_size
@@ -266,7 +272,7 @@ class EntityLinker(TrainablePipe):
         # Raise an error if the knowledge base is not initialized.
         if self.kb is None:
             raise ValueError(Errors.E1018.format(name=self.name))
-        if len(self.kb) == 0:
+        if hasattr(self.kb, "is_empty") and self.kb.is_empty():
            raise ValueError(Errors.E139.format(name=self.name))
 
     def initialize(
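With the default `spacy.EmptyKB.v2` factory wired in, constructing the component still yields an empty in-memory KB, and `validate_kb` now goes through `is_empty()`. A quick sketch, assuming a build that includes these changes:

```python
import spacy
from spacy.kb.kb_in_memory import InMemoryLookupKB

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe("entity_linker")

# The default generate_empty_kb factory (spacy.EmptyKB.v2) produces an empty
# in-memory KB with the default 64-dimensional entity vectors.
assert isinstance(entity_linker.kb, InMemoryLookupKB)
assert entity_linker.kb.is_empty()
assert entity_linker.kb.entity_vector_length == 64
```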
@@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
 def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
     tokens = sv_tokenizer(text)
     assert len(tokens) == 3
+
+
+@pytest.mark.issue(12311)
+@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
+def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
+    tokens = sv_tokenizer(text)
+    assert len(tokens) == 1
@@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
         ("the", "brown", "$--", 0),
         ("brown", "the", "$--", 1),
         ("brown", "brown", "$--", 0),
+        ("over", "jumped", "<+", 0),
+        ("quick", "fox", "<+", 0),
+        ("the", "quick", "<+", 0),
+        ("brown", "fox", "<+", 1),
         ("quick", "fox", "<++", 1),
         ("quick", "over", "<++", 0),
         ("over", "jumped", "<++", 0),
         ("the", "fox", "<++", 2),
+        ("brown", "fox", "<-", 0),
+        ("fox", "over", "<-", 0),
+        ("the", "over", "<-", 0),
+        ("over", "jumped", "<-", 1),
         ("brown", "fox", "<--", 0),
         ("fox", "jumped", "<--", 0),
         ("fox", "over", "<--", 1),
+        ("fox", "brown", ">+", 0),
+        ("over", "fox", ">+", 0),
+        ("over", "the", ">+", 0),
+        ("jumped", "over", ">+", 1),
         ("jumped", "over", ">++", 1),
         ("fox", "lazy", ">++", 0),
         ("over", "the", ">++", 0),
+        ("jumped", "over", ">-", 0),
+        ("fox", "quick", ">-", 0),
+        ("brown", "quick", ">-", 0),
+        ("fox", "brown", ">-", 1),
         ("brown", "fox", ">--", 0),
         ("fox", "brown", ">--", 1),
         ("jumped", "fox", ">--", 1),
@@ -353,6 +353,9 @@ def test_kb_default(nlp):
     """Test that the default (empty) KB is loaded upon construction"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     assert len(entity_linker.kb) == 0
+    with pytest.raises(ValueError, match="E139"):
+        # this raises an error because the KB is empty
+        entity_linker.validate_kb()
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0
     # 64 is the default value from pipeline.entity_linker
@@ -1,7 +1,10 @@
-from typing import Callable
+from pathlib import Path
+from typing import Callable, Iterable, Any, Dict
 
-from spacy import util
-from spacy.util import ensure_path, registry, load_model_from_config
+import srsly
+
+from spacy import util, Errors
+from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
 from spacy.kb.kb_in_memory import InMemoryLookupKB
 from spacy.vocab import Vocab
 from thinc.api import Config
@@ -91,7 +94,10 @@ def test_serialize_subclassed_kb():
 
     [components.entity_linker]
     factory = "entity_linker"
-
+    
+    [components.entity_linker.generate_empty_kb]
+    @misc = "kb_test.CustomEmptyKB.v1"
+    
     [initialize]
 
     [initialize.components]
@@ -99,7 +105,7 @@ def test_serialize_subclassed_kb():
     [initialize.components.entity_linker]
 
     [initialize.components.entity_linker.kb_loader]
-    @misc = "spacy.CustomKB.v1"
+    @misc = "kb_test.CustomKB.v1"
     entity_vector_length = 342
     custom_field = 666
     """
@@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
             super().__init__(vocab, entity_vector_length)
             self.custom_field = custom_field
 
-    @registry.misc("spacy.CustomKB.v1")
+        def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+            """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
+            path = ensure_path(path)
+            if not path.exists():
+                path.mkdir(parents=True)
+            if not path.is_dir():
+                raise ValueError(Errors.E928.format(loc=path))
+
+            def serialize_custom_fields(file_path: Path) -> None:
+                srsly.write_json(file_path, {"custom_field": self.custom_field})
+
+            serialize = {
+                "contents": lambda p: self.write_contents(p),
+                "strings.json": lambda p: self.vocab.strings.to_disk(p),
+                "custom_fields": lambda p: serialize_custom_fields(p),
+            }
+            util.to_disk(path, serialize, exclude)
+
+        def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+            """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
+            path = ensure_path(path)
+            if not path.exists():
+                raise ValueError(Errors.E929.format(loc=path))
+            if not path.is_dir():
+                raise ValueError(Errors.E928.format(loc=path))
+
+            def deserialize_custom_fields(file_path: Path) -> None:
+                self.custom_field = srsly.read_json(file_path)["custom_field"]
+
+            deserialize: Dict[str, Callable[[Any], Any]] = {
+                "contents": lambda p: self.read_contents(p),
+                "strings.json": lambda p: self.vocab.strings.from_disk(p),
+                "custom_fields": lambda p: deserialize_custom_fields(p),
+            }
+            util.from_disk(path, deserialize, exclude)
+
+    @registry.misc("kb_test.CustomEmptyKB.v1")
+    def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
+        def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+            return SubInMemoryLookupKB(
+                vocab=vocab,
+                entity_vector_length=entity_vector_length,
+                custom_field=0,
+            )
+
+        return empty_kb_factory
+
+    @registry.misc("kb_test.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
-    ) -> Callable[[Vocab], InMemoryLookupKB]:
+    ) -> Callable[[Vocab], SubInMemoryLookupKB]:
         def custom_kb_factory(vocab):
             kb = SubInMemoryLookupKB(
                 vocab=vocab,
@@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
         nlp2 = util.load_model_from_path(tmp_dir)
         entity_linker2 = nlp2.get_pipe("entity_linker")
         # After IO, the KB is the standard one
-        assert type(entity_linker2.kb) == InMemoryLookupKB
+        assert type(entity_linker2.kb) == SubInMemoryLookupKB
         assert entity_linker2.kb.entity_vector_length == 342
-        assert not hasattr(entity_linker2.kb, "custom_field")
+        assert entity_linker2.kb.custom_field == 666
@@ -1,5 +1,7 @@
 import os
 from pathlib import Path
+import pytest
+import srsly
 from typer.testing import CliRunner
 from spacy.tokens import DocBin, Doc
 
@@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
         # Instead of checking specific wording of the output, which may change,
         # we'll check that this section of the debug output is present.
         assert "= Trainable Lemmatizer =" in result_debug_data.stdout
+
+
+# project tests
+
+SAMPLE_PROJECT = {
+    "title": "Sample project",
+    "description": "This is a project for testing",
+    "assets": [
+        {
+            "dest": "assets/spacy-readme.md",
+            "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
+            "checksum": "411b2c89ccf34288fae8ed126bf652f7",
+        },
+        {
+            "dest": "assets/citation.cff",
+            "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
+            "checksum": "c996bfd80202d480eb2e592369714e5e",
+            "extra": True,
+        },
+    ],
+    "commands": [
+        {
+            "name": "ok",
+            "help": "print ok",
+            "script": ["python -c \"print('okokok')\""],
+        },
+        {
+            "name": "create",
+            "help": "make a file",
+            "script": ["touch abc.txt"],
+            "outputs": ["abc.txt"],
+        },
+        {
+            "name": "clean",
+            "help": "remove test file",
+            "script": ["rm abc.txt"],
+        },
+    ],
+}
+
+SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
+
+
+@pytest.fixture
+def project_dir():
+    with make_tempdir() as pdir:
+        (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
+        yield pdir
+
+
+def test_project_document(project_dir):
+    readme_path = project_dir / "README.md"
+    assert not readme_path.exists(), "README already exists"
+    result = CliRunner().invoke(
+        app, ["project", "document", str(project_dir), "-o", str(readme_path)]
+    )
+    assert result.exit_code == 0
+    assert readme_path.is_file()
+    text = readme_path.read_text("utf-8")
+    assert SAMPLE_PROJECT["description"] in text
+
+
+def test_project_assets(project_dir):
+    asset_dir = project_dir / "assets"
+    assert not asset_dir.exists(), "Assets dir is already present"
+    result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
+    assert result.exit_code == 0
+    assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
+    # check that extras work
+    result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
+    assert result.exit_code == 0
+    assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
+
+
+def test_project_run(project_dir):
+    # make sure dry run works
+    test_file = project_dir / "abc.txt"
+    result = CliRunner().invoke(
+        app, ["project", "run", "--dry", "create", str(project_dir)]
+    )
+    assert result.exit_code == 0
+    assert not test_file.is_file()
+    result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
+    assert result.exit_code == 0
+    assert test_file.is_file()
+    result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
+    assert result.exit_code == 0
+    assert "okokok" in result.stdout
+
+
+@pytest.mark.parametrize(
+    "options",
+    [
+        "",
+        # "--sparse",
+        "--branch v3",
+        "--repo https://github.com/explosion/projects --branch v3",
+    ],
+)
+def test_project_clone(options):
+    with make_tempdir() as workspace:
+        out = workspace / "project"
+        target = "benchmarks/ner_conll03"
+        if not options:
+            options = []
+        else:
+            options = options.split()
+        result = CliRunner().invoke(
+            app, ["project", "clone", target, *options, str(out)]
+        )
+        assert result.exit_code == 0
+        assert (out / "README.md").is_file()
+
+
+def test_project_push_pull(project_dir):
+    proj = dict(SAMPLE_PROJECT)
+    remote = "xyz"
+
+    with make_tempdir() as remote_dir:
+        proj["remotes"] = {remote: str(remote_dir)}
+        proj_text = srsly.yaml_dumps(proj)
+        (project_dir / "project.yml").write_text(proj_text)
+
+        test_file = project_dir / "abc.txt"
+        result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
+        assert result.exit_code == 0
+        assert test_file.is_file()
+        result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
+        assert result.exit_code == 0
+        result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
+        assert result.exit_code == 0
+        assert not test_file.exists()
+        result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
+        assert result.exit_code == 0
+        assert test_file.is_file()
@@ -98,7 +98,7 @@ def assert_sents_error(doc):
 
 def warn_error(proc_name, proc, docs, e):
     logger = logging.getLogger("spacy")
-    logger.warning(f"Trouble with component {proc_name}.")
+    logger.warning("Trouble with component %s.", proc_name)
 
 
 @pytest.fixture
@@ -11,7 +11,7 @@ def create_copy_from_base_model(
 ) -> Callable[[Language], Language]:
     def copy_from_base_model(nlp):
         if tokenizer:
-            logger.info(f"Copying tokenizer from: {tokenizer}")
+            logger.info("Copying tokenizer from: %s", tokenizer)
             base_nlp = load_model(tokenizer)
             if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
                 nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
@@ -23,7 +23,7 @@ def create_copy_from_base_model(
                     )
                 )
         if vocab:
-            logger.info(f"Copying vocab from: {vocab}")
+            logger.info("Copying vocab from: %s", vocab)
             # only reload if the vocab is from a different model
             if tokenizer != vocab:
                 base_nlp = load_model(vocab)
@@ -29,7 +29,7 @@ def create_docbin_reader(
 ) -> Callable[["Language"], Iterable[Example]]:
     if path is None:
         raise ValueError(Errors.E913)
-    util.logger.debug(f"Loading corpus from path: {path}")
+    util.logger.debug("Loading corpus from path: %s", path)
     return Corpus(
         path,
         gold_preproc=gold_preproc,
|  | @ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": | ||||||
|     frozen_components = T["frozen_components"] |     frozen_components = T["frozen_components"] | ||||||
|     # Sourced components that require resume_training |     # Sourced components that require resume_training | ||||||
|     resume_components = [p for p in sourced if p not in frozen_components] |     resume_components = [p for p in sourced if p not in frozen_components] | ||||||
|     logger.info(f"Pipeline: {nlp.pipe_names}") |     logger.info("Pipeline: %s", nlp.pipe_names) | ||||||
|     if resume_components: |     if resume_components: | ||||||
|         with nlp.select_pipes(enable=resume_components): |         with nlp.select_pipes(enable=resume_components): | ||||||
|             logger.info(f"Resuming training for: {resume_components}") |             logger.info("Resuming training for: %s", resume_components) | ||||||
|             nlp.resume_training(sgd=optimizer) |             nlp.resume_training(sgd=optimizer) | ||||||
|     # Make sure that listeners are defined before initializing further |     # Make sure that listeners are defined before initializing further | ||||||
|     nlp._link_components() |     nlp._link_components() | ||||||
|  | @ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": | ||||||
|         if T["max_epochs"] == -1: |         if T["max_epochs"] == -1: | ||||||
|             sample_size = 100 |             sample_size = 100 | ||||||
|             logger.debug( |             logger.debug( | ||||||
|                 f"Due to streamed train corpus, using only first {sample_size} " |                 "Due to streamed train corpus, using only first %s examples for initialization. " | ||||||
|                 f"examples for initialization. If necessary, provide all labels " |                 "If necessary, provide all labels in [initialize]. " | ||||||
|                 f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" |                 "More info: https://spacy.io/api/cli#init_labels", | ||||||
|  |                 sample_size, | ||||||
|             ) |             ) | ||||||
|             nlp.initialize( |             nlp.initialize( | ||||||
|                 lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer |                 lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer | ||||||
|             ) |             ) | ||||||
|         else: |         else: | ||||||
|             nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) |             nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) | ||||||
|         logger.info(f"Initialized pipeline components: {nlp.pipe_names}") |         logger.info("Initialized pipeline components: %s", nlp.pipe_names) | ||||||
|     # Detect components with listeners that are not frozen consistently |     # Detect components with listeners that are not frozen consistently | ||||||
|     for name, proc in nlp.pipeline: |     for name, proc in nlp.pipeline: | ||||||
|         for listener in getattr( |         for listener in getattr( | ||||||
|  | @ -109,7 +110,7 @@ def init_vocab( | ||||||
| ) -> None: | ) -> None: | ||||||
|     if lookups: |     if lookups: | ||||||
|         nlp.vocab.lookups = lookups |         nlp.vocab.lookups = lookups | ||||||
|         logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") |         logger.info("Added vocab lookups: %s", ", ".join(lookups.tables)) | ||||||
|     data_path = ensure_path(data) |     data_path = ensure_path(data) | ||||||
|     if data_path is not None: |     if data_path is not None: | ||||||
|         lex_attrs = srsly.read_jsonl(data_path) |         lex_attrs = srsly.read_jsonl(data_path) | ||||||
|  | @ -125,11 +126,11 @@ def init_vocab( | ||||||
|         else: |         else: | ||||||
|             oov_prob = DEFAULT_OOV_PROB |             oov_prob = DEFAULT_OOV_PROB | ||||||
|         nlp.vocab.cfg.update({"oov_prob": oov_prob}) |         nlp.vocab.cfg.update({"oov_prob": oov_prob}) | ||||||
|         logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") |         logger.info("Added %d lexical entries to the vocab", len(nlp.vocab)) | ||||||
|     logger.info("Created vocabulary") |     logger.info("Created vocabulary") | ||||||
|     if vectors is not None: |     if vectors is not None: | ||||||
|         load_vectors_into_model(nlp, vectors) |         load_vectors_into_model(nlp, vectors) | ||||||
|         logger.info(f"Added vectors: {vectors}") |         logger.info("Added vectors: %s", vectors) | ||||||
|     # warn if source model vectors are not identical |     # warn if source model vectors are not identical | ||||||
|     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) |     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) | ||||||
|     vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) |     vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) | ||||||
|  | @ -191,7 +192,7 @@ def init_tok2vec( | ||||||
|     if weights_data is not None: |     if weights_data is not None: | ||||||
|         layer = get_tok2vec_ref(nlp, P) |         layer = get_tok2vec_ref(nlp, P) | ||||||
|         layer.from_bytes(weights_data) |         layer.from_bytes(weights_data) | ||||||
|         logger.info(f"Loaded pretrained weights from {init_tok2vec}") |         logger.info("Loaded pretrained weights from %s", init_tok2vec) | ||||||
|         return True |         return True | ||||||
|     return False |     return False | ||||||
| 
 | 
 | ||||||
|  | @ -215,13 +216,13 @@ def convert_vectors( | ||||||
|         nlp.vocab.deduplicate_vectors() |         nlp.vocab.deduplicate_vectors() | ||||||
|     else: |     else: | ||||||
|         if vectors_loc: |         if vectors_loc: | ||||||
|             logger.info(f"Reading vectors from {vectors_loc}") |             logger.info("Reading vectors from %s", vectors_loc) | ||||||
|             vectors_data, vector_keys, floret_settings = read_vectors( |             vectors_data, vector_keys, floret_settings = read_vectors( | ||||||
|                 vectors_loc, |                 vectors_loc, | ||||||
|                 truncate, |                 truncate, | ||||||
|                 mode=mode, |                 mode=mode, | ||||||
|             ) |             ) | ||||||
|             logger.info(f"Loaded vectors from {vectors_loc}") |             logger.info("Loaded vectors from %s", vectors_loc) | ||||||
|         else: |         else: | ||||||
|             vectors_data, vector_keys = (None, None) |             vectors_data, vector_keys = (None, None) | ||||||
|         if vector_keys is not None and mode != VectorsMode.floret: |         if vector_keys is not None and mode != VectorsMode.floret: | ||||||
|  |  | ||||||
|  | @ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None: | ||||||
|             if subdir.exists(): |             if subdir.exists(): | ||||||
|                 try: |                 try: | ||||||
|                     shutil.rmtree(str(subdir)) |                     shutil.rmtree(str(subdir)) | ||||||
|                     logger.debug(f"Removed existing output directory: {subdir}") |                     logger.debug("Removed existing output directory: %s", subdir) | ||||||
|                 except Exception as e: |                 except Exception as e: | ||||||
|                     raise IOError(Errors.E901.format(path=path)) from e |                     raise IOError(Errors.E901.format(path=path)) from e | ||||||
|  |  | ||||||
|  | @ -33,6 +33,7 @@ import inspect | ||||||
| import pkgutil | import pkgutil | ||||||
| import logging | import logging | ||||||
| import socket | import socket | ||||||
|  | import stat | ||||||
| 
 | 
 | ||||||
| try: | try: | ||||||
|     import cupy.random |     import cupy.random | ||||||
|  | @ -55,7 +56,7 @@ if TYPE_CHECKING: | ||||||
| # fmt: off | # fmt: off | ||||||
| OOV_RANK = numpy.iinfo(numpy.uint64).max | OOV_RANK = numpy.iinfo(numpy.uint64).max | ||||||
| DEFAULT_OOV_PROB = -20 | DEFAULT_OOV_PROB = -20 | ||||||
| LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] | LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] | ||||||
| 
 | 
 | ||||||
| # Default order of sections in the config file. Not all sections need to exist, | # Default order of sections in the config file. Not all sections need to exist, | ||||||
| # and additional sections are added at the end, in alphabetical order. | # and additional sections are added at the end, in alphabetical order. | ||||||
|  | @ -139,8 +140,17 @@ class registry(thinc.registry): | ||||||
|         return func |         return func | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def find(cls, registry_name: str, func_name: str) -> Callable: |     def find( | ||||||
|         """Get info about a registered function from the registry.""" |         cls, registry_name: str, func_name: str | ||||||
|  |     ) -> Dict[str, Optional[Union[str, int]]]: | ||||||
|  |         """Find information about a registered function, including the | ||||||
|  |         module and path to the file it's defined in, the line number and the | ||||||
|  |         docstring, if available. | ||||||
|  | 
 | ||||||
|  |         registry_name (str): Name of the catalogue registry. | ||||||
|  |         func_name (str): Name of the registered function. | ||||||
|  |         RETURNS (Dict[str, Optional[Union[str, int]]]): The function info. | ||||||
|  |         """ | ||||||
|         # We're overwriting this classmethod so we're able to provide more |         # We're overwriting this classmethod so we're able to provide more | ||||||
|         # specific error messages and implement a fallback to spacy-legacy. |         # specific error messages and implement a fallback to spacy-legacy. | ||||||
|         if not hasattr(cls, registry_name): |         if not hasattr(cls, registry_name): | ||||||
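With this change, `registry.find` returns metadata rather than the function itself. A hedged sketch of the new behaviour; the registry and function names are illustrative, and the exact keys depend on what can be introspected:

```python
import spacy

# Returns a dict describing the registered function: module, file path,
# line number and docstring, if available
info = spacy.registry.find("architectures", "spacy.TextCatBOW.v2")
print(info)
```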
|  | @ -1028,11 +1038,19 @@ def make_tempdir() -> Generator[Path, None, None]: | ||||||
| 
 | 
 | ||||||
|     YIELDS (Path): The path of the temp directory. |     YIELDS (Path): The path of the temp directory. | ||||||
|     """ |     """ | ||||||
|  |     d = Path(tempfile.mkdtemp()) | ||||||
|  |     yield d | ||||||
|  | 
 | ||||||
|  |     # On Windows, git clones use read-only files, which cause permission errors | ||||||
|  |     # when being deleted. This forcibly fixes permissions. | ||||||
|  |     def force_remove(rmfunc, path, ex): | ||||||
|  |         os.chmod(path, stat.S_IWRITE) | ||||||
|  |         rmfunc(path) | ||||||
|  | 
 | ||||||
|     try: |     try: | ||||||
|         with tempfile.TemporaryDirectory() as td: |         shutil.rmtree(str(d), onerror=force_remove) | ||||||
|             yield Path(td) |  | ||||||
|     except PermissionError as e: |     except PermissionError as e: | ||||||
|         warnings.warn(Warnings.W091.format(dir=td, msg=e)) |         warnings.warn(Warnings.W091.format(dir=d, msg=e)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def is_cwd(path: Union[Path, str]) -> bool: | def is_cwd(path: Union[Path, str]) -> bool: | ||||||
|  |  | ||||||
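Callers of `make_tempdir` are unaffected; only the teardown changed. A quick usage sketch:

```python
from spacy.util import make_tempdir

# Yields a pathlib.Path; on exit the directory is removed, with file
# permissions forced writable first so read-only git checkouts on
# Windows no longer raise PermissionError during cleanup
with make_tempdir() as tmp_dir:
    (tmp_dir / "data.txt").write_text("example")
```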
|  | @ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a | ||||||
| | `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | | `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||||
| | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                                    | | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                                    | | ||||||
| 
 | 
 | ||||||
| ### spacy.EmptyKB.v1 {id="EmptyKB"} | ### spacy.EmptyKB.v1 {id="EmptyKB.v1"} | ||||||
| 
 | 
 | ||||||
| A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) | A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) | ||||||
| instance. This is the default when a new entity linker component is created. | instance. | ||||||
| 
 | 
 | ||||||
| | Name                   | Description                                                                         | | | Name                   | Description                                                                         | | ||||||
| | ---------------------- | ----------------------------------------------------------------------------------- | | | ---------------------- | ----------------------------------------------------------------------------------- | | ||||||
| | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | | ||||||
| 
 | 
 | ||||||
|  | ### spacy.EmptyKB.v2 {id="EmptyKB"} | ||||||
|  | 
 | ||||||
|  | A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) | ||||||
|  | instance. This is the default when a new entity linker component is created. It | ||||||
|  | returns a `Callable[[Vocab, int], InMemoryLookupKB]`. | ||||||
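A minimal sketch of resolving and calling the factory directly, assuming it is registered in the `misc` registry as the entity linker's `generate_empty_kb` default:

```python
from spacy.util import registry
from spacy.vocab import Vocab

# The registered function returns a callable that builds an empty
# InMemoryLookupKB for a given vocab and entity vector length
make_empty_kb = registry.get("misc", "spacy.EmptyKB.v2")()
kb = make_empty_kb(Vocab(), 64)
```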
|  | 
 | ||||||
| ### spacy.KBFromFile.v1 {id="KBFromFile"} | ### spacy.KBFromFile.v1 {id="KBFromFile"} | ||||||
| 
 | 
 | ||||||
| A function that reads an existing `KnowledgeBase` from file. | A function that reads an existing `KnowledgeBase` from file. | ||||||
|  | @ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default | ||||||
| `CandidateGenerator` uses the text of a mention to find its potential aliases in | `CandidateGenerator` uses the text of a mention to find its potential aliases in | ||||||
| the `KnowledgeBase`. Note that this function is case-dependent. | the `KnowledgeBase`. Note that this function is case-dependent. | ||||||
| 
 | 
 | ||||||
|  | ### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"} | ||||||
|  | 
 | ||||||
|  | A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of | ||||||
|  | [`Span`](/api/span) objects denoting named entities, and returns a list of | ||||||
|  | plausible [`Candidate`](/api/kb/#candidate) objects per specified | ||||||
|  | [`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a | ||||||
|  | mention to find its potential aliases in the `KnowledgeBase`. Note that this | ||||||
|  | function is case-dependent. | ||||||
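A custom batch generator only needs to match this signature and be registered so a config can reference it. A hedged sketch: the registry name is made up, and falling back to per-mention `kb.get_candidates` calls is an assumption for illustration, not the built-in implementation:

```python
from typing import Callable, Iterable

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span
from spacy.util import registry

@registry.misc("my_candidates_batch.v1")  # hypothetical name
def create_candidates_batch() -> Callable[
    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
]:
    def get_candidates_batch(
        kb: KnowledgeBase, mentions: Iterable[Span]
    ) -> Iterable[Iterable[Candidate]]:
        # Naive batching: one KB lookup per mention
        return [kb.get_candidates(span) for span in mentions]

    return get_candidates_batch
```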
|  | 
 | ||||||
| ## Coreference {id="coref-architectures",tag="experimental"} | ## Coreference {id="coref-architectures",tag="experimental"} | ||||||
| 
 | 
 | ||||||
| A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to | A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to | ||||||
|  |  | ||||||
|  | @ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir] | ||||||
| ### project pull {id="project-pull",tag="command"} | ### project pull {id="project-pull",tag="command"} | ||||||
| 
 | 
 | ||||||
| Download all files or directories listed as `outputs` for commands, unless they | Download all files or directories listed as `outputs` for commands, unless they | ||||||
| are not already present locally. When searching for files in the remote, `pull` | are already present locally. When searching for files in the remote, `pull` | ||||||
| won't just look at the output path, but will also consider the **command | won't just look at the output path, but will also consider the **command | ||||||
| string** and the **hashes of the dependencies**. For instance, let's say you've | string** and the **hashes of the dependencies**. For instance, let's say you've | ||||||
| previously pushed a checkpoint to the remote, but now you've changed some | previously pushed a checkpoint to the remote, but now you've changed some | ||||||
|  |  | ||||||
|  | @ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which | ||||||
| come directly from | come directly from | ||||||
| [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): | [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): | ||||||
| 
 | 
 | ||||||
| | Symbol    | Description                                                                                                          | | | Symbol                                  | Description                                                                                                          | | ||||||
| | --------- | -------------------------------------------------------------------------------------------------------------------- | | | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `A < B`   | `A` is the immediate dependent of `B`.                                                                               | | | `A < B`                                 | `A` is the immediate dependent of `B`.                                                                               | | ||||||
| | `A > B`   | `A` is the immediate head of `B`.                                                                                    | | | `A > B`                                 | `A` is the immediate head of `B`.                                                                                    | | ||||||
| | `A << B`  | `A` is the dependent in a chain to `B` following dep → head paths.                                              | | | `A << B`                                | `A` is the dependent in a chain to `B` following dep → head paths.                                              | | ||||||
| | `A >> B`  | `A` is the head in a chain to `B` following head → dep paths.                                                   | | | `A >> B`                                | `A` is the head in a chain to `B` following head → dep paths.                                                   | | ||||||
| | `A . B`   | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree.                   | | | `A . B`                                 | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree.                   | | ||||||
| | `A .* B`  | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_.                 | | | `A .* B`                                | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_.                 | | ||||||
| | `A ; B`   | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | | | `A ; B`                                 | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | | ||||||
| | `A ;* B`  | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_.                  | | | `A ;* B`                                | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_.                  | | ||||||
| | `A $+ B`  | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`.                 | | | `A $+ B`                                | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`.                 | | ||||||
| | `A $- B`  | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`.                  | | | `A $- B`                                | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`.                  | | ||||||
| | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`.                                | | | `A $++ B`                               | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`.                                | | ||||||
| | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`.                                 | | | `A $-- B`                               | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`.                                 | | ||||||
| | `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | | `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_.          | | ||||||
| | `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | | `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_.           | | ||||||
| | `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | | `A >++ B`                               | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | ||||||
| | `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | | `A >-- B`                               | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | ||||||
|  | | `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_.          | | ||||||
|  | | `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_.           | | ||||||
|  | | `A <++ B`                               | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | ||||||
|  | | `A <-- B`                               | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | ||||||
| 
 | 
 | ||||||
| ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} | ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -53,20 +53,22 @@ architectures and their arguments and hyperparameters. | ||||||
| > nlp.add_pipe("entity_linker", config=config) | > nlp.add_pipe("entity_linker", config=config) | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| | Setting                                         | Description                                                                                                                                                                                                                                                                                 | | | Setting                                             | Description                                                                                                                                                                                                                                                                                                      | | ||||||
| | ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `labels_discard`                                | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                              | | | `labels_discard`                                    | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                                   | | ||||||
| | `n_sents`                                       | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                           | | | `n_sents`                                           | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                                                | | ||||||
| | `incl_prior`                                    | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                        | | | `incl_prior`                                        | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                             | | ||||||
| | `incl_context`                                  | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                      | | | `incl_context`                                      | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                           | | ||||||
| | `model`                                         | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                      | | | `model`                                             | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~                                                                                                                                                           | | ||||||
| | `entity_vector_length`                          | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                               | | | `entity_vector_length`                              | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~                                                                                                                                                                                                                                                    | | ||||||
| | `use_gold_ents`                                 | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                        | | | `use_gold_ents`                                     | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~                                                                                                                             | | ||||||
| | `get_candidates`                                | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                    | | | `get_candidates`                                    | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~                                         | | ||||||
| | `overwrite` <Tag variant="new">3.2</Tag>        | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~                                                                                                                                                                                                                   | | | `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | | ||||||
| | `scorer` <Tag variant="new">3.2</Tag>           | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                     | | | `generate_empty_kb` <Tag variant="new">3.6</Tag>    | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~                                                                           | | ||||||
|  | | `overwrite` <Tag variant="new">3.2</Tag>            | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~                                                                                                                                                                                                                                         | | ||||||
|  | | `scorer` <Tag variant="new">3.2</Tag>               | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~                                                                                                                                                                                                          | | ||||||
| | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        | | | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~                                                                                                                                                                        | | ||||||
| | `threshold` <Tag variant="new">3.4</Tag>        | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | | | `threshold` <Tag variant="new">3.4</Tag>            | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~                      | | ||||||
| 
 | 
 | ||||||
| ```python | ```python | ||||||
| %%GITHUB_SPACY/spacy/pipeline/entity_linker.py | %%GITHUB_SPACY/spacy/pipeline/entity_linker.py | ||||||
|  |  | ||||||
|  | @ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used. | ||||||
| > displacy.serve(doc, style="dep", options=options) | > displacy.serve(doc, style="dep", options=options) | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| | Name               | Description                                                                                                                                  | | | Name               | Description                                                                                                                                                                                                                                   | | ||||||
| | ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | | | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `fine_grained`     | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~             | | | `fine_grained`     | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~                                                                                                              | | ||||||
| | `add_lemma`        | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~                                                      | | | `add_lemma`        | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~                                                                                                                                                       | | ||||||
| `collapse_punct`   | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs that attach punctuation. Defaults to `True`. ~~bool~~ | | `collapse_punct`   | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs that attach punctuation. Defaults to `True`. ~~bool~~                                                                                                | | ||||||
| | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~                                                                             | | | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~                                                                                                                                                                              | | ||||||
| | `compact`          | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~                                                    | | | `compact`          | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~                                                                                                                                                     | | ||||||
| `color`            | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~                                                                       | | `color`            | Text color. Can be provided as a string in any legal CSS color format, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | | ||||||
| `bg`               | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~                                                                 | | `bg`               | Background color. Can be provided as a string in any legal CSS color format, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | | ||||||
| | `font`             | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~                                                                        | | | `font`             | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~                                                                                                                                                                         | | ||||||
| | `offset_x`         | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~                                                                             | | | `offset_x`         | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~                                                                                                                                                                              | | ||||||
| | `arrow_stroke`     | Width of arrow path in px. Defaults to `2`. ~~int~~                                                                                          | | | `arrow_stroke`     | Width of arrow path in px. Defaults to `2`. ~~int~~                                                                                                                                                                                           | | ||||||
| | `arrow_width`      | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~                                                 | | | `arrow_width`      | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~                                                                                                                                                  | | ||||||
| | `arrow_spacing`    | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~                           | | | `arrow_spacing`    | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~                                                                                                                            | | ||||||
| | `word_spacing`     | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~                                                                     | | | `word_spacing`     | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~                                                                                                                                                                      | | ||||||
| | `distance`         | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~                                           | | | `distance`         | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~                                                                                                                                            | | ||||||
| 
 | 
 | ||||||
| #### Named Entity Visualizer options {id="displacy_options-ent"} | #### Named Entity Visualizer options {id="displacy_options-ent"} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1100,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which | ||||||
| come directly from | come directly from | ||||||
| [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): | [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): | ||||||
| 
 | 
 | ||||||
| | Symbol    | Description                                                                                                          | | | Symbol                                  | Description                                                                                                          | | ||||||
| | --------- | -------------------------------------------------------------------------------------------------------------------- | | | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `A < B`   | `A` is the immediate dependent of `B`.                                                                               | | | `A < B`                                 | `A` is the immediate dependent of `B`.                                                                               | | ||||||
| | `A > B`   | `A` is the immediate head of `B`.                                                                                    | | | `A > B`                                 | `A` is the immediate head of `B`.                                                                                    | | ||||||
| | `A << B`  | `A` is the dependent in a chain to `B` following dep → head paths.                                              | | | `A << B`                                | `A` is the dependent in a chain to `B` following dep → head paths.                                              | | ||||||
| | `A >> B`  | `A` is the head in a chain to `B` following head → dep paths.                                                   | | | `A >> B`                                | `A` is the head in a chain to `B` following head → dep paths.                                                   | | ||||||
| | `A . B`   | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree.                   | | | `A . B`                                 | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree.                   | | ||||||
| | `A .* B`  | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_.                 | | | `A .* B`                                | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_.                 | | ||||||
| | `A ; B`   | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | | | `A ; B`                                 | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | | ||||||
| | `A ;* B`  | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_.                  | | | `A ;* B`                                | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_.                  | | ||||||
| | `A $+ B`  | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`.                 | | | `A $+ B`                                | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`.                 | | ||||||
| | `A $- B`  | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`.                  | | | `A $- B`                                | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`.                  | | ||||||
| | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`.                                | | | `A $++ B`                               | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`.                                | | ||||||
| | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`.                                 | | | `A $-- B`                               | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`.                                 | | ||||||
|  | | `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_.          | | ||||||
|  | | `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_.           | | ||||||
|  | | `A >++ B`                               | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | ||||||
|  | | `A >-- B`                               | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | ||||||
|  | | `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_.          | | ||||||
|  | | `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_.           | | ||||||
|  | | `A <++ B`                               | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_.                         | | ||||||
|  | | `A <-- B`                               | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_.                          | | ||||||
| 
 | 
 | ||||||
| ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} | ||||||
| 
 | 
 | ||||||
|  | @ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline") | ||||||
| 
 | 
 | ||||||
| The saved pipeline now includes the `"entity_ruler"` in its | The saved pipeline now includes the `"entity_ruler"` in its | ||||||
| [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a | [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a | ||||||
| file `entityruler.jsonl` with the patterns. When you load the pipeline back in, | file `patterns.jsonl` with the patterns. When you load the pipeline back in, all | ||||||
| all pipeline components will be restored and deserialized – including the entity | pipeline components will be restored and deserialized – including the entity | ||||||
| ruler. This lets you ship powerful pipeline packages with binary weights _and_ | ruler. This lets you ship powerful pipeline packages with binary weights _and_ | ||||||
| rules included! | rules included! | ||||||
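A quick sketch of the round trip, assuming the pipeline was saved to the path used above:

```python
import spacy

nlp = spacy.load("/path/to/pipeline")
# The entity ruler is restored from config.cfg and patterns.jsonl,
# alongside the statistical components and their binary weights
print(nlp.pipe_names)
```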
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -58,12 +58,12 @@ arcs. | ||||||
| 
 | 
 | ||||||
| </Infobox> | </Infobox> | ||||||
| 
 | 
 | ||||||
| | Argument  | Description                                                                               | | | Argument  | Description                                                                                                                                                                                                                                   | | ||||||
| | --------- | ----------------------------------------------------------------------------------------- | | | --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | | | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~                                                                                                                                                     | | ||||||
| `color`   | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~                    | | `color`   | Text color. Can be provided as a string in any legal CSS color format, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~       | | ||||||
| `bg`      | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~              | | `bg`      | Background color. Can be provided as a string in any legal CSS color format, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | | ||||||
| | `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~                     | | | `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~                                                                                                                                                                         | | ||||||
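A minimal sketch of passing these arguments, assuming an installed English pipeline:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
options = {"compact": True, "color": "white", "bg": "#09a3d5", "font": "Source Sans Pro"}
displacy.serve(doc, style="dep", options=options)
```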
| 
 | 
 | ||||||
| For a list of all available options, see the | For a list of all available options, see the | ||||||
| [`displacy` API documentation](/api/top-level#displacy_options). | [`displacy` API documentation](/api/top-level#displacy_options). | ||||||
|  |  | ||||||