mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 09:57:26 +03:00

commit f33f0ed160

Merge branch 'v4' into feature/docwise-generator-batching

# Conflicts:
#	spacy/pipeline/entity_linker.py
#	website/docs/api/entitylinker.mdx

.github/workflows/autoblack.yml (2 changes, vendored)
@@ -16,7 +16,7 @@ jobs:
         with:
             ref: ${{ github.head_ref }}
       - uses: actions/setup-python@v4
-      - run: pip install black
+      - run: pip install black -c requirements.txt
       - name: Auto-format code if needed
        run: black spacy
       # We can't run black --check here because that returns a non-zero excit
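In this job and in the Azure pipeline below, `-c requirements.txt` makes pip treat requirements.txt as a constraints file, so CI installs exactly the `black==22.3.0` pinned there (see the requirements.txt hunk further down) rather than whatever release is newest.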
@@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
 Python modules. If you've built spaCy from source, you'll already have both
 tools installed.
 
+As a general rule of thumb, we use f-strings for any formatting of strings.
+One exception are calls to Python's `logging` functionality.
+To avoid unnecessary string conversions in these cases, we use string formatting
+templates with `%s` and `%d` etc.
+
 **⚠️ Note that formatting and linting is currently only possible for Python
 modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
 
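The convention described above is what the `logger.debug` changes throughout this commit implement. A minimal self-contained sketch of the two styles (names are illustrative):

import logging

logger = logging.getLogger(__name__)
count = 3

# General string formatting: use f-strings.
message = f"Processed {count} docs"

# Logging: pass a %-style template plus arguments instead, so the
# string is only interpolated when the log level is actually enabled.
logger.debug("Processed %d docs", count)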
@@ -41,7 +41,7 @@ jobs:
         inputs:
           versionSpec: "3.8"
       - script: |
-          pip install black==22.3.0
+          pip install black -c requirements.txt
           python -m black spacy --check
         displayName: "black"
       - script: |
@@ -30,9 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<0.1000; platform_machine != "aarch64"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
+types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
-black>=22.0,<23.0
+black==22.3.0
@@ -90,9 +90,9 @@ def parse_config_overrides(
     cli_overrides = _parse_overrides(args, is_cli=True)
     if cli_overrides:
         keys = [k for k in cli_overrides if k not in env_overrides]
-        logger.debug(f"Config overrides from CLI: {keys}")
+        logger.debug("Config overrides from CLI: %s", keys)
     if env_overrides:
-        logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+        logger.debug("Config overrides from env variables: %s", list(env_overrides))
     return {**cli_overrides, **env_overrides}
 
 
@@ -252,7 +252,7 @@ def get_third_party_dependencies(
                     raise regerr from None
             module_name = func_info.get("module")  # type: ignore[attr-defined]
             if module_name:  # the code is part of a module, not a --code file
-                modules.add(func_info["module"].split(".")[0])  # type: ignore[index]
+                modules.add(func_info["module"].split(".")[0])  # type: ignore[union-attr]
     dependencies = []
     for module_name in modules:
         if module_name in distributions:
@@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
     # in the list.
     while commands:
         for i, cmd in enumerate(list(commands)):
-            logger.debug(f"CMD: {cmd['name']}.")
+            logger.debug("CMD: %s.", cmd["name"])
             deps = [project_dir / dep for dep in cmd.get("deps", [])]
             if all(dep.exists() for dep in deps):
                 cmd_hash = get_command_hash("", "", deps, cmd["script"])
                 for output_path in cmd.get("outputs", []):
                     url = storage.pull(output_path, command_hash=cmd_hash)
                     logger.debug(
-                        f"URL: {url} for {output_path} with command hash {cmd_hash}"
+                        "URL: %s for %s with command hash %s",
+                        url,
+                        output_path,
+                        cmd_hash,
                     )
                     yield url, output_path
 
@@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
                 commands.pop(i)
                 break
             else:
-                logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
+                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
         else:
             # If we didn't break the for loop, break the while loop.
             break
@@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
         remote = config["remotes"][remote]
     storage = RemoteStorage(project_dir, remote)
     for cmd in config.get("commands", []):
-        logger.debug(f"CMD: cmd['name']")
+        logger.debug("CMD: %s", cmd["name"])
         deps = [project_dir / dep for dep in cmd.get("deps", [])]
         if any(not dep.exists() for dep in deps):
-            logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
+            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
             continue
         cmd_hash = get_command_hash(
             "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
         )
-        logger.debug(f"CMD_HASH: {cmd_hash}")
+        logger.debug("CMD_HASH: %s", cmd_hash)
         for output_path in cmd.get("outputs", []):
             output_loc = project_dir / output_path
             if output_loc.exists() and _is_not_empty_dir(output_loc):
@@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
                     content_hash=get_content_hash(output_loc),
                 )
                 logger.debug(
-                    f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
+                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
                 )
                 yield output_path, url
 
@@ -437,8 +437,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E133 = ("The sum of prior probabilities for alias '{alias}' should not "
             "exceed 1, but found {sum}.")
     E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
-    E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
-            "`kb.add_entity` and `kb.add_alias` to add entries.")
+    E139 = ("Knowledge base for component '{name}' is empty.")
     E140 = ("The list of entities, prior probabilities and entity vectors "
             "should be of equal length.")
     E141 = ("Entity vectors should be of length {required} instead of the "
@@ -951,7 +950,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
              "with `displacy.serve(doc, port=port)`")
     E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
-             "or use `auto_switch_port=True` to pick an available port automatically.")
+             "or use `auto_select_port=True` to pick an available port automatically.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")
@@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         self._alias_index = PreshMap(nr_aliases + 1)
         self._aliases_table = alias_vec(nr_aliases + 1)
 
+    def is_empty(self):
+        return len(self) == 0
+
     def __len__(self):
         return self.get_size_entities()
 
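The `is_empty()` hook added here is what the `validate_kb` change in the entity linker (further down) prefers over `len()`. A standalone sketch of that calling pattern, using a hypothetical helper name:

def kb_has_entries(kb) -> bool:
    # Prefer the KB's own is_empty() when it defines one, so custom
    # KnowledgeBase implementations that don't support len() still work.
    if hasattr(kb, "is_empty"):
        return not kb.is_empty()
    return len(kb) > 0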
@@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language, BaseDefaults
 from ...pipeline import Lemmatizer
-
-# Punctuation stolen from Danish
-from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 
 
 class SwedishDefaults(BaseDefaults):

spacy/lang/sv/punctuation.py (new file, 33 lines)
@@ -0,0 +1,33 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..punctuation import TOKENIZER_SUFFIXES
+
+
+_quotes = CONCAT_QUOTES.replace("'", "")
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
+    ]
+)
+
+_suffixes = [
+    suffix
+    for suffix in TOKENIZER_SUFFIXES
+    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
+]
+_suffixes += [r"(?<=[^sSxXzZ])\'"]
+
+
+TOKENIZER_INFIXES = _infixes
+TOKENIZER_SUFFIXES = _suffixes
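These rules keep Swedish colon abbreviations together, which the test added further down (issue 12311) asserts for "99:e", "c:a", "EU:s" and "Maj:t". A minimal usage sketch, assuming a build that includes this file:

import spacy

nlp = spacy.blank("sv")
# "EU:s" should stay one token: the colon infix rules above only
# split when an uppercase letter follows the colon.
doc = nlp("EU:s")
print([t.text for t in doc])  # expected: ["EU:s"]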
@@ -106,7 +106,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
 
 @registry.misc("spacy.LookupsDataLoader.v1")
 def load_lookups_data(lang, tables):
-    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
     lookups = load_lookups(lang=lang, tables=tables)
     return lookups
 
@@ -2072,7 +2072,7 @@ class Language:
         pipe = self.get_pipe(pipe_name)
         pipe_cfg = self._pipe_configs[pipe_name]
         if listeners:
-            util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
+            util.logger.debug("Replacing listeners of component '%s'", pipe_name)
             if len(list(listeners)) != len(pipe_listeners):
                 # The number of listeners defined in the component model doesn't
                 # match the listeners to replace, so we won't be able to update
@@ -82,8 +82,12 @@ cdef class DependencyMatcher:
             "$-": self._imm_left_sib,
             "$++": self._right_sib,
             "$--": self._left_sib,
+            ">+": self._imm_right_child,
+            ">-": self._imm_left_child,
             ">++": self._right_child,
             ">--": self._left_child,
+            "<+": self._imm_right_parent,
+            "<-": self._imm_left_parent,
             "<++": self._right_parent,
             "<--": self._left_parent,
         }
@@ -427,12 +431,34 @@ cdef class DependencyMatcher:
     def _left_sib(self, doc, node):
         return [doc[child.i] for child in doc[node].head.children if child.i < node]
 
+    def _imm_right_child(self, doc, node):
+        for child in doc[node].children:
+            if child.i == node + 1:
+                return [doc[child.i]]
+        return []
+
+    def _imm_left_child(self, doc, node):
+        for child in doc[node].children:
+            if child.i == node - 1:
+                return [doc[child.i]]
+        return []
+
     def _right_child(self, doc, node):
         return [doc[child.i] for child in doc[node].children if child.i > node]
 
     def _left_child(self, doc, node):
         return [doc[child.i] for child in doc[node].children if child.i < node]
 
+    def _imm_right_parent(self, doc, node):
+        if doc[node].head.i == node + 1:
+            return [doc[node].head]
+        return []
+
+    def _imm_left_parent(self, doc, node):
+        if doc[node].head.i == node - 1:
+            return [doc[node].head]
+        return []
+
     def _right_parent(self, doc, node):
         if doc[node].head.i > node:
             return [doc[node].head]
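The four new operators (`>+`, `>-`, `<+`, `<-`) match a child or head that is immediately adjacent in linear order. A usage sketch mirroring the new test case ("jumped", "over", ">+", 1) added further down; it assumes an installed pipeline with a dependency parser, e.g. en_core_web_sm:

import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)

# ">+" matches a syntactic child that immediately follows its head.
pattern = [
    {"RIGHT_ID": "anchor", "RIGHT_ATTRS": {"ORTH": "jumped"}},
    {
        "LEFT_ID": "anchor",
        "REL_OP": ">+",
        "RIGHT_ID": "child",
        "RIGHT_ATTRS": {"ORTH": "over"},
    },
]
matcher.add("imm_right_child", [pattern])

doc = nlp("The quick brown fox jumped over the lazy dog.")
for match_id, (anchor, child) in matcher(doc):
    print(doc[anchor].text, "->", doc[child].text)  # jumped -> over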
@@ -829,6 +829,11 @@ def _get_attr_values(spec, string_store):
     return attr_values
 
 
+def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
+    # tuple order affects performance
+    return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
+
+
 # These predicate helper classes are used to match the REGEX, IN, >= etc
 # extensions to the matcher introduced in #3173.
 
@@ -848,7 +853,7 @@ class _FuzzyPredicate:
         fuzz = self.predicate[len("FUZZY"):] # number after prefix
         self.fuzzy = int(fuzz) if fuzz else -1
         self.fuzzy_compare = fuzzy_compare
-        self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
 
     def __call__(self, Token token):
         if self.is_extension:
@@ -870,7 +875,7 @@ class _RegexPredicate:
         self.value = re.compile(value)
         self.predicate = predicate
         self.is_extension = is_extension
-        self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = _predicate_cache_key(self.attr, self.predicate, value)
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
 
@@ -906,7 +911,7 @@ class _SetPredicate:
                 self.value = set(get_string_id(v) for v in value)
         self.predicate = predicate
         self.is_extension = is_extension
-        self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
 
@@ -978,7 +983,7 @@ class _ComparisonPredicate:
         self.value = value
         self.predicate = predicate
         self.is_extension = is_extension
-        self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
+        self.key = _predicate_cache_key(self.attr, self.predicate, value)
         if self.predicate not in self.operators:
             raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
 
@@ -1093,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
         if isinstance(value, dict):
             for type_, cls in predicate_types.items():
                 if type_ in value:
-                    key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
+                    key = _predicate_cache_key(attr, type_, value[type_])
                     if key in seen_predicates:
                         output.append(seen_predicates[key])
                     else:
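The shared helper centralizes the cache-key layout that each predicate class previously assembled by hand, so equivalent predicates built anywhere in a pattern hash to the same key and get deduplicated through `seen_predicates`. A standalone sketch of the mechanism:

import srsly

def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
    # tuple order affects performance
    return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))

# Two specs with the same IN-list produce the same key, so the second
# lookup can reuse the cached predicate instead of building a new one.
seen_predicates = {}
key = _predicate_cache_key("LOWER", "IN", ["fox", "dog"])
assert key == _predicate_cache_key("LOWER", "IN", ["fox", "dog"])
seen_predicates.setdefault(key, "predicate instance")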
@@ -89,6 +89,14 @@ def load_kb(
     return kb_from_file
 
 
+@registry.misc("spacy.EmptyKB.v2")
+def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
+    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
+
+    return empty_kb_factory
+
+
 @registry.misc("spacy.EmptyKB.v1")
 def empty_kb(
     entity_vector_length: int,
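Unlike `spacy.EmptyKB.v1`, the new factory receives the vocab as an argument, so the entity linker can build its default KB directly from config. A sketch of selecting it (or a custom replacement) via `add_pipe`, assuming a build that includes this change:

import spacy

nlp = spacy.blank("en")
# "generate_empty_kb" is the config slot added to the entity_linker
# defaults below; "spacy.EmptyKB.v2" is its registered default.
nlp.add_pipe(
    "entity_linker",
    config={"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}},
)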
@@ -57,6 +57,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
         "entity_vector_length": 64,
         "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
         "overwrite": False,
+        "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
         "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
         "use_gold_ents": True,
         "threshold": None,
@@ -79,6 +80,7 @@ def make_entity_linker(
     incl_context: bool,
     entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]],
+    generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
     scorer: Optional[Callable],
     use_gold_ents: bool,
@@ -98,6 +100,7 @@ def make_entity_linker(
     get_candidates (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]):
         Function producing a list of candidates per document, given a certain knowledge base and several textual
         documents with textual mentions.
+    generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
     use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
         component must provide entity annotations.
@@ -137,6 +140,7 @@ def make_entity_linker(
         incl_context=incl_context,
         entity_vector_length=entity_vector_length,
         get_candidates=get_candidates,
+        generate_empty_kb=generate_empty_kb,
         overwrite=overwrite,
         scorer=scorer,
         use_gold_ents=use_gold_ents,
@@ -174,6 +178,7 @@ class EntityLinker(TrainablePipe):
         incl_context: bool,
         entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]],
+        generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
         overwrite: bool = False,
         scorer: Optional[Callable] = entity_linker_score,
         use_gold_ents: bool,
@@ -194,12 +199,14 @@ class EntityLinker(TrainablePipe):
         get_candidates (Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]):
             Function producing a list of candidates per document, given a certain knowledge base and several textual
             documents with textual mentions.
+        generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
         overwrite (bool): Whether to overwrite existing non-empty annotations.
         scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
         use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
             component must provide entity annotations.
         threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
             threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
+        save_activations (bool): save model activations in Doc when annotating.
 
         DOCS: https://spacy.io/api/entitylinker#init
         """
@@ -216,15 +223,14 @@ class EntityLinker(TrainablePipe):
         self.model = model
         self.name = name
         self.labels_discard = list(labels_discard)
+        # how many neighbour sentences to take into account
         self.n_sents = n_sents
         self.incl_prior = incl_prior
         self.incl_context = incl_context
         self.get_candidates = get_candidates
         self.cfg: Dict[str, Any] = {"overwrite": overwrite}
         self.distance = CosineDistance(normalize=False)
-        # how many neighbour sentences to take into account
-        # create an empty KB by default
-        self.kb = empty_kb(entity_vector_length)(self.vocab)
+        self.kb = generate_empty_kb(self.vocab, entity_vector_length)
         self.scorer = scorer
         self.use_gold_ents = use_gold_ents
         self.threshold = threshold
@@ -242,7 +248,7 @@ class EntityLinker(TrainablePipe):
         # Raise an error if the knowledge base is not initialized.
         if self.kb is None:
             raise ValueError(Errors.E1018.format(name=self.name))
-        if len(self.kb) == 0:
+        if hasattr(self.kb, "is_empty") and self.kb.is_empty():
            raise ValueError(Errors.E139.format(name=self.name))
 
     def initialize(
@@ -306,7 +312,6 @@ class EntityLinker(TrainablePipe):
 
         If one isn't present, then the update step needs to be skipped.
         """
-        # todo continue here: fix get_candidates_call
         for candidates_for_doc in self.get_candidates(
             self.kb, (SpanGroup(doc=eg.predicted, spans=eg.predicted.ents) for eg in examples)
         ):
@@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
 def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
     tokens = sv_tokenizer(text)
     assert len(tokens) == 3
+
+
+@pytest.mark.issue(12311)
+@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
+def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
+    tokens = sv_tokenizer(text)
+    assert len(tokens) == 1
@@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
         ("the", "brown", "$--", 0),
         ("brown", "the", "$--", 1),
         ("brown", "brown", "$--", 0),
+        ("over", "jumped", "<+", 0),
+        ("quick", "fox", "<+", 0),
+        ("the", "quick", "<+", 0),
+        ("brown", "fox", "<+", 1),
         ("quick", "fox", "<++", 1),
         ("quick", "over", "<++", 0),
         ("over", "jumped", "<++", 0),
         ("the", "fox", "<++", 2),
+        ("brown", "fox", "<-", 0),
+        ("fox", "over", "<-", 0),
+        ("the", "over", "<-", 0),
+        ("over", "jumped", "<-", 1),
         ("brown", "fox", "<--", 0),
         ("fox", "jumped", "<--", 0),
         ("fox", "over", "<--", 1),
+        ("fox", "brown", ">+", 0),
+        ("over", "fox", ">+", 0),
+        ("over", "the", ">+", 0),
+        ("jumped", "over", ">+", 1),
         ("jumped", "over", ">++", 1),
         ("fox", "lazy", ">++", 0),
         ("over", "the", ">++", 0),
+        ("jumped", "over", ">-", 0),
+        ("fox", "quick", ">-", 0),
+        ("brown", "quick", ">-", 0),
+        ("fox", "brown", ">-", 1),
         ("brown", "fox", ">--", 0),
         ("fox", "brown", ">--", 1),
         ("jumped", "fox", ">--", 1),
@@ -353,6 +353,9 @@ def test_kb_default(nlp):
     """Test that the default (empty) KB is loaded upon construction"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     assert len(entity_linker.kb) == 0
+    with pytest.raises(ValueError, match="E139"):
+        # this raises an error because the KB is empty
+        entity_linker.validate_kb()
     assert entity_linker.kb.get_size_entities() == 0
     assert entity_linker.kb.get_size_aliases() == 0
     # 64 is the default value from pipeline.entity_linker
@@ -1,7 +1,10 @@
-from typing import Callable
+from pathlib import Path
+from typing import Callable, Iterable, Any, Dict
 
-from spacy import util
-from spacy.util import ensure_path, registry, load_model_from_config
+import srsly
+
+from spacy import util, Errors
+from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
 from spacy.kb.kb_in_memory import InMemoryLookupKB
 from spacy.vocab import Vocab
 from thinc.api import Config
@@ -92,6 +95,9 @@ def test_serialize_subclassed_kb():
     [components.entity_linker]
     factory = "entity_linker"
 
+    [components.entity_linker.generate_empty_kb]
+    @misc = "kb_test.CustomEmptyKB.v1"
+
     [initialize]
 
     [initialize.components]
@@ -99,7 +105,7 @@ def test_serialize_subclassed_kb():
     [initialize.components.entity_linker]
 
     [initialize.components.entity_linker.kb_loader]
-    @misc = "spacy.CustomKB.v1"
+    @misc = "kb_test.CustomKB.v1"
     entity_vector_length = 342
     custom_field = 666
     """
@@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
             super().__init__(vocab, entity_vector_length)
             self.custom_field = custom_field
 
-    @registry.misc("spacy.CustomKB.v1")
+        def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+            """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
+            path = ensure_path(path)
+            if not path.exists():
+                path.mkdir(parents=True)
+            if not path.is_dir():
+                raise ValueError(Errors.E928.format(loc=path))
+
+            def serialize_custom_fields(file_path: Path) -> None:
+                srsly.write_json(file_path, {"custom_field": self.custom_field})
+
+            serialize = {
+                "contents": lambda p: self.write_contents(p),
+                "strings.json": lambda p: self.vocab.strings.to_disk(p),
+                "custom_fields": lambda p: serialize_custom_fields(p),
+            }
+            util.to_disk(path, serialize, exclude)
+
+        def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
+            """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
+            path = ensure_path(path)
+            if not path.exists():
+                raise ValueError(Errors.E929.format(loc=path))
+            if not path.is_dir():
+                raise ValueError(Errors.E928.format(loc=path))
+
+            def deserialize_custom_fields(file_path: Path) -> None:
+                self.custom_field = srsly.read_json(file_path)["custom_field"]
+
+            deserialize: Dict[str, Callable[[Any], Any]] = {
+                "contents": lambda p: self.read_contents(p),
+                "strings.json": lambda p: self.vocab.strings.from_disk(p),
+                "custom_fields": lambda p: deserialize_custom_fields(p),
+            }
+            util.from_disk(path, deserialize, exclude)
+
+    @registry.misc("kb_test.CustomEmptyKB.v1")
+    def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
+        def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
+            return SubInMemoryLookupKB(
+                vocab=vocab,
+                entity_vector_length=entity_vector_length,
+                custom_field=0,
+            )
+
+        return empty_kb_factory
+
+    @registry.misc("kb_test.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
-    ) -> Callable[[Vocab], InMemoryLookupKB]:
+    ) -> Callable[[Vocab], SubInMemoryLookupKB]:
         def custom_kb_factory(vocab):
             kb = SubInMemoryLookupKB(
                 vocab=vocab,
@@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
         nlp2 = util.load_model_from_path(tmp_dir)
         entity_linker2 = nlp2.get_pipe("entity_linker")
         # After IO, the KB is the standard one
-        assert type(entity_linker2.kb) == InMemoryLookupKB
+        assert type(entity_linker2.kb) == SubInMemoryLookupKB
         assert entity_linker2.kb.entity_vector_length == 342
-        assert not hasattr(entity_linker2.kb, "custom_field")
+        assert entity_linker2.kb.custom_field == 666
@@ -1,5 +1,7 @@
 import os
 from pathlib import Path
+import pytest
+import srsly
 from typer.testing import CliRunner
 from spacy.tokens import DocBin, Doc
 
@@ -89,3 +91,138 @@ def test_debug_data_trainable_lemmatizer_cli(en_vocab):
         # Instead of checking specific wording of the output, which may change,
         # we'll check that this section of the debug output is present.
         assert "= Trainable Lemmatizer =" in result_debug_data.stdout
+
+
+# project tests
+
+SAMPLE_PROJECT = {
+    "title": "Sample project",
+    "description": "This is a project for testing",
+    "assets": [
+        {
+            "dest": "assets/spacy-readme.md",
+            "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
+            "checksum": "411b2c89ccf34288fae8ed126bf652f7",
+        },
+        {
+            "dest": "assets/citation.cff",
+            "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
+            "checksum": "c996bfd80202d480eb2e592369714e5e",
+            "extra": True,
+        },
+    ],
+    "commands": [
+        {
+            "name": "ok",
+            "help": "print ok",
+            "script": ["python -c \"print('okokok')\""],
+        },
+        {
+            "name": "create",
+            "help": "make a file",
+            "script": ["touch abc.txt"],
+            "outputs": ["abc.txt"],
+        },
+        {
+            "name": "clean",
+            "help": "remove test file",
+            "script": ["rm abc.txt"],
+        },
+    ],
+}
+
+SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
+
+
+@pytest.fixture
+def project_dir():
+    with make_tempdir() as pdir:
+        (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
+        yield pdir
+
+
+def test_project_document(project_dir):
+    readme_path = project_dir / "README.md"
+    assert not readme_path.exists(), "README already exists"
+    result = CliRunner().invoke(
+        app, ["project", "document", str(project_dir), "-o", str(readme_path)]
+    )
+    assert result.exit_code == 0
+    assert readme_path.is_file()
+    text = readme_path.read_text("utf-8")
+    assert SAMPLE_PROJECT["description"] in text
+
+
+def test_project_assets(project_dir):
+    asset_dir = project_dir / "assets"
+    assert not asset_dir.exists(), "Assets dir is already present"
+    result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
+    assert result.exit_code == 0
+    assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
+    # check that extras work
+    result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
+    assert result.exit_code == 0
+    assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
+
+
+def test_project_run(project_dir):
+    # make sure dry run works
+    test_file = project_dir / "abc.txt"
+    result = CliRunner().invoke(
+        app, ["project", "run", "--dry", "create", str(project_dir)]
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    assert result.exit_code == 0
 | 
				
			||||||
 | 
					    assert not test_file.is_file()
 | 
				
			||||||
 | 
					    result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
 | 
				
			||||||
 | 
					    assert result.exit_code == 0
 | 
				
			||||||
 | 
					    assert test_file.is_file()
 | 
				
			||||||
 | 
					    result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
 | 
				
			||||||
 | 
					    assert result.exit_code == 0
 | 
				
			||||||
 | 
					    assert "okokok" in result.stdout
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "options",
 | 
				
			||||||
 | 
					    [
 | 
				
			||||||
 | 
					        "",
 | 
				
			||||||
 | 
					        # "--sparse",
 | 
				
			||||||
 | 
					        "--branch v3",
 | 
				
			||||||
 | 
					        "--repo https://github.com/explosion/projects --branch v3",
 | 
				
			||||||
 | 
					    ],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_project_clone(options):
 | 
				
			||||||
 | 
					    with make_tempdir() as workspace:
 | 
				
			||||||
 | 
					        out = workspace / "project"
 | 
				
			||||||
 | 
					        target = "benchmarks/ner_conll03"
 | 
				
			||||||
 | 
					        if not options:
 | 
				
			||||||
 | 
					            options = []
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            options = options.split()
 | 
				
			||||||
 | 
					        result = CliRunner().invoke(
 | 
				
			||||||
 | 
					            app, ["project", "clone", target, *options, str(out)]
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        assert result.exit_code == 0
 | 
				
			||||||
 | 
					        assert (out / "README.md").is_file()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_project_push_pull(project_dir):
 | 
				
			||||||
 | 
					    proj = dict(SAMPLE_PROJECT)
 | 
				
			||||||
 | 
					    remote = "xyz"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    with make_tempdir() as remote_dir:
 | 
				
			||||||
 | 
					        proj["remotes"] = {remote: str(remote_dir)}
 | 
				
			||||||
 | 
					        proj_text = srsly.yaml_dumps(proj)
 | 
				
			||||||
 | 
					        (project_dir / "project.yml").write_text(proj_text)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        test_file = project_dir / "abc.txt"
 | 
				
			||||||
 | 
					        result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
 | 
				
			||||||
 | 
					        assert result.exit_code == 0
 | 
				
			||||||
 | 
					        assert test_file.is_file()
 | 
				
			||||||
 | 
					        result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
 | 
				
			||||||
 | 
					        assert result.exit_code == 0
 | 
				
			||||||
 | 
					        result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
 | 
				
			||||||
 | 
					        assert result.exit_code == 0
 | 
				
			||||||
 | 
					        assert not test_file.exists()
 | 
				
			||||||
 | 
					        result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
 | 
				
			||||||
 | 
					        assert result.exit_code == 0
 | 
				
			||||||
 | 
					        assert test_file.is_file()
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
@@ -98,7 +98,7 @@ def assert_sents_error(doc):


 def warn_error(proc_name, proc, docs, e):
     logger = logging.getLogger("spacy")
-    logger.warning(f"Trouble with component {proc_name}.")
+    logger.warning("Trouble with component %s.", proc_name)


 @pytest.fixture
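The change above swaps eager f-string formatting for logging's lazy %-style templates: an f-string is evaluated even when the record is filtered out, whereas %-style arguments are only interpolated if a handler actually emits the record. A minimal sketch of the difference:

```python
import logging

logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)  # WARNING records will be filtered out

proc_name = "entity_linker"

# Eager: the f-string is built before logging decides to drop the record.
logger.warning(f"Trouble with component {proc_name}.")

# Lazy: the %-template is only interpolated if the record is emitted.
logger.warning("Trouble with component %s.", proc_name)
```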
@@ -131,9 +131,9 @@ class Doc:
         default: str = ...,
     ) -> None: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
-    def sents(self) -> Iterator[Span]: ...
+    def sents(self) -> Tuple[Span]: ...
     @property
     def lang(self) -> int: ...
     @property
@@ -703,10 +703,10 @@ cdef class Doc:
         return self.text

     property ents:
-        """The named entities in the document. Returns a tuple of named entity
+        """The named entities in the document. Returns a list of named entity
         `Span` objects, if the entity recognizer has been applied.

-        RETURNS (tuple): Entities in the document, one `Span` per entity.
+        RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity.

         DOCS: https://spacy.io/api/doc#ents
         """
@@ -864,7 +864,7 @@ cdef class Doc:
         NP-level coordination, no prepositional phrases, and no relative
         clauses.

-        YIELDS (Span): Noun chunks in the document.
+        RETURNS (Tuple[Span]): Noun chunks in the document.

         DOCS: https://spacy.io/api/doc#noun_chunks
         """
@@ -873,36 +873,35 @@ cdef class Doc:

         # Accumulate the result before beginning to iterate over it. This
         # prevents the tokenization from being changed out from under us
-        # during the iteration. The tricky thing here is that Span accepts
-        # its tokenization changing, so it's okay once we have the Span
-        # objects. See Issue #375.
+        # during the iteration.
         spans = []
         for start, end, label in self.noun_chunks_iterator(self):
             spans.append(Span(self, start, end, label=label))
-        for span in spans:
-            yield span
+        return tuple(spans)

     @property
     def sents(self):
         """Iterate over the sentences in the document. Yields sentence `Span`
         objects. Sentence spans have no label.

-        YIELDS (Span): Sentences in the document.
+        RETURNS (Tuple[Span]): Sentences in the document.

         DOCS: https://spacy.io/api/doc#sents
         """
         if not self.has_annotation("SENT_START"):
             raise ValueError(Errors.E030)
         if "sents" in self.user_hooks:
-            yield from self.user_hooks["sents"](self)
+            return tuple(self.user_hooks["sents"](self))
         else:
             start = 0
+            spans = []
             for i in range(1, self.length):
                 if self.c[i].sent_start == 1:
-                    yield Span(self, start, i)
+                    spans.append(Span(self, start, i))
                     start = i
             if start != self.length:
-                yield Span(self, start, self.length)
+                spans.append(Span(self, start, self.length))
+            return tuple(spans)

     @property
     def lang(self):
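The hunks above replace the generator-based `Doc.noun_chunks` and `Doc.sents` with properties that build and return tuples, so the results can be indexed, sliced and measured with `len()` without an explicit `list()` round-trip. A short usage sketch of the new behavior, assuming a pipeline such as `en_core_web_sm` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes this pipeline is installed
doc = nlp("Autonomous cars shift insurance liability. Manufacturers will adapt.")

sents = doc.sents
chunks = doc.noun_chunks
assert isinstance(sents, tuple) and isinstance(chunks, tuple)
print(len(sents), sents[0].text)         # e.g. 2 "Autonomous cars shift insurance liability."
print([chunk.text for chunk in chunks])  # e.g. ['Autonomous cars', 'insurance liability', 'Manufacturers']
```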
@@ -74,6 +74,8 @@ class Span:
     @property
     def ents(self) -> Tuple[Span]: ...
     @property
+    def sents(self) -> Tuple[Span]: ...
+    @property
     def has_vector(self) -> bool: ...
     @property
     def vector(self) -> Floats1d: ...
@@ -86,7 +88,7 @@ class Span:
     @property
     def text_with_ws(self) -> str: ...
     @property
-    def noun_chunks(self) -> Iterator[Span]: ...
+    def noun_chunks(self) -> Tuple[Span]: ...
     @property
     def root(self) -> Token: ...
     def char_span(
@@ -461,7 +461,7 @@ cdef class Span:
         """Obtain the sentences that contain this span. If the given span
         crosses sentence boundaries, return all sentences it is a part of.

-        RETURNS (Iterable[Span]): All sentences that the span is a part of.
+        RETURNS (Tuple[Span]): All sentences that the span is a part of.

         DOCS: https://spacy.io/api/span#sents
         """
@@ -469,12 +469,13 @@ cdef class Span:
         cdef int i

         if "sents" in self.doc.user_span_hooks:
-            yield from self.doc.user_span_hooks["sents"](self)
-        elif "sents" in self.doc.user_hooks:
+            return tuple(self.doc.user_span_hooks["sents"](self))
+        spans = []
+        if "sents" in self.doc.user_hooks:
             for sentence in self.doc.user_hooks["sents"](self.doc):
                 if sentence.end > self.start:
                     if sentence.start < self.end or sentence.start == self.start == self.end:
-                        yield sentence
+                        spans.append(sentence)
                     else:
                         break
         else:
@@ -489,12 +490,13 @@ cdef class Span:
             # Now, find all the sentences in the span
             for i in range(start + 1, self.doc.length):
                 if self.doc.c[i].sent_start == 1:
-                    yield Span(self.doc, start, i)
+                    spans.append(Span(self.doc, start, i))
                     start = i
                     if start >= self.end:
                         break
             if start < self.end:
-                yield Span(self.doc, start, self.end)
+                spans.append(Span(self.doc, start, self.end))
+        return tuple(spans)


     @property
@@ -502,7 +504,7 @@ cdef class Span:
         """The named entities that fall completely within the span. Returns
         a tuple of `Span` objects.

-        RETURNS (tuple): Entities in the span, one `Span` per entity.
+        RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity.

         DOCS: https://spacy.io/api/span#ents
         """
@@ -517,7 +519,7 @@ cdef class Span:
                     ents.append(ent)
                 else:
                     break
-        return ents
+        return tuple(ents)

     @property
     def has_vector(self):
@@ -613,13 +615,15 @@ cdef class Span:
         NP-level coordination, no prepositional phrases, and no relative
         clauses.

-        YIELDS (Span): Noun chunks in the span.
+        RETURNS (Tuple[Span]): Noun chunks in the span.

         DOCS: https://spacy.io/api/span#noun_chunks
         """
+        spans = []
         for span in self.doc.noun_chunks:
             if span.start >= self.start and span.end <= self.end:
-                yield span
+                spans.append(span)
+        return tuple(spans)

     @property
     def root(self):
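`Span.sents` gets the same treatment: for a span that crosses a sentence boundary, the property now returns a tuple of every sentence the span overlaps, exactly as the docstring describes. A sketch under the same pipeline assumption as above:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes this pipeline is installed
doc = nlp("This is one sentence. This is another one.")

span = doc[3:8]  # "sentence . This is another" – straddles the boundary
sentences = span.sents
assert isinstance(sentences, tuple)
print([sent.text for sent in sentences])  # both sentences, given a correct parse
```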
@@ -11,7 +11,7 @@ def create_copy_from_base_model(
 ) -> Callable[[Language], Language]:
     def copy_from_base_model(nlp):
         if tokenizer:
-            logger.info(f"Copying tokenizer from: {tokenizer}")
+            logger.info("Copying tokenizer from: %s", tokenizer)
             base_nlp = load_model(tokenizer)
             if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
                 nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
@@ -23,7 +23,7 @@ def create_copy_from_base_model(
                     )
                 )
         if vocab:
-            logger.info(f"Copying vocab from: {vocab}")
+            logger.info("Copying vocab from: %s", vocab)
             # only reload if the vocab is from a different model
             if tokenizer != vocab:
                 base_nlp = load_model(vocab)
@@ -29,7 +29,7 @@ def create_docbin_reader(
 ) -> Callable[["Language"], Iterable[Example]]:
     if path is None:
         raise ValueError(Errors.E913)
-    util.logger.debug(f"Loading corpus from path: {path}")
+    util.logger.debug("Loading corpus from path: %s", path)
     return Corpus(
         path,
         gold_preproc=gold_preproc,
@@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced if p not in frozen_components]
-    logger.info(f"Pipeline: {nlp.pipe_names}")
+    logger.info("Pipeline: %s", nlp.pipe_names)
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            logger.info(f"Resuming training for: {resume_components}")
+            logger.info("Resuming training for: %s", resume_components)
             nlp.resume_training(sgd=optimizer)
     # Make sure that listeners are defined before initializing further
     nlp._link_components()
@@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
         if T["max_epochs"] == -1:
             sample_size = 100
             logger.debug(
-                f"Due to streamed train corpus, using only first {sample_size} "
-                f"examples for initialization. If necessary, provide all labels "
-                f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+                "Due to streamed train corpus, using only first %s examples for initialization. "
+                "If necessary, provide all labels in [initialize]. "
+                "More info: https://spacy.io/api/cli#init_labels",
+                sample_size,
             )
             nlp.initialize(
                 lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
             )
         else:
             nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+        logger.info("Initialized pipeline components: %s", nlp.pipe_names)
     # Detect components with listeners that are not frozen consistently
     for name, proc in nlp.pipeline:
         for listener in getattr(
@@ -109,7 +110,7 @@ def init_vocab(
 ) -> None:
     if lookups:
         nlp.vocab.lookups = lookups
-        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
     data_path = ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
@@ -125,11 +126,11 @@ def init_vocab(
         else:
             oov_prob = DEFAULT_OOV_PROB
         nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+        logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
     logger.info("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        logger.info(f"Added vectors: {vectors}")
+        logger.info("Added vectors: %s", vectors)
     # warn if source model vectors are not identical
     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
     vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
@@ -191,7 +192,7 @@ def init_tok2vec(
     if weights_data is not None:
         layer = get_tok2vec_ref(nlp, P)
         layer.from_bytes(weights_data)
-        logger.info(f"Loaded pretrained weights from {init_tok2vec}")
+        logger.info("Loaded pretrained weights from %s", init_tok2vec)
         return True
     return False
@@ -215,13 +216,13 @@ def convert_vectors(
         nlp.vocab.deduplicate_vectors()
     else:
         if vectors_loc:
-            logger.info(f"Reading vectors from {vectors_loc}")
+            logger.info("Reading vectors from %s", vectors_loc)
             vectors_data, vector_keys, floret_settings = read_vectors(
                 vectors_loc,
                 truncate,
                 mode=mode,
             )
-            logger.info(f"Loaded vectors from {vectors_loc}")
+            logger.info("Loaded vectors from %s", vectors_loc)
         else:
             vectors_data, vector_keys = (None, None)
         if vector_keys is not None and mode != VectorsMode.floret:
@@ -371,6 +371,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
             if subdir.exists():
                 try:
                     shutil.rmtree(str(subdir))
-                    logger.debug(f"Removed existing output directory: {subdir}")
+                    logger.debug("Removed existing output directory: %s", subdir)
                 except Exception as e:
                     raise IOError(Errors.E901.format(path=path)) from e
@@ -33,6 +33,7 @@ import inspect
 import pkgutil
 import logging
 import socket
+import stat

 try:
     import cupy.random
@@ -55,7 +56,7 @@ if TYPE_CHECKING:
 # fmt: off
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]

 # Default order of sections in the config file. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
@@ -139,8 +140,17 @@ class registry(thinc.registry):
         return func

     @classmethod
-    def find(cls, registry_name: str, func_name: str) -> Callable:
-        """Get info about a registered function from the registry."""
+    def find(
+        cls, registry_name: str, func_name: str
+    ) -> Dict[str, Optional[Union[str, int]]]:
+        """Find information about a registered function, including the
+        module and path to the file it's defined in, the line number and the
+        docstring, if available.
+
+        registry_name (str): Name of the catalogue registry.
+        func_name (str): Name of the registered function.
+        RETURNS (Dict[str, Optional[Union[str, int]]]): The function info.
+        """
         # We're overwriting this classmethod so we're able to provide more
         # specific error messages and implement a fallback to spacy-legacy.
         if not hasattr(cls, registry_name):
@@ -1028,11 +1038,19 @@ def make_tempdir() -> Generator[Path, None, None]:

     YIELDS (Path): The path of the temp directory.
     """
+    d = Path(tempfile.mkdtemp())
+    yield d
+
+    # On Windows, git clones use read-only files, which cause permission errors
+    # when being deleted. This forcibly fixes permissions.
+    def force_remove(rmfunc, path, ex):
+        os.chmod(path, stat.S_IWRITE)
+        rmfunc(path)
+
     try:
-        with tempfile.TemporaryDirectory() as td:
-            yield Path(td)
+        shutil.rmtree(str(d), onerror=force_remove)
     except PermissionError as e:
-        warnings.warn(Warnings.W091.format(dir=td, msg=e))
+        warnings.warn(Warnings.W091.format(dir=d, msg=e))


 def is_cwd(path: Union[Path, str]) -> bool:
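The extended `registry.find` above returns metadata about a registered function instead of the function itself. A sketch of inspecting the returned dict; the lookup name `spacy.EmptyKB.v1` follows the docs changes later in this diff, and the printed fields match the docstring added above:

```python
import spacy  # noqa: F401 – importing spaCy populates its built-in registries
from spacy.util import registry

# With this change, `find` returns an info dict rather than the function.
info = registry.find("misc", "spacy.EmptyKB.v1")
print(info["module"])     # module the function is defined in
print(info["line_no"])    # line number, if available
print(info["docstring"])  # docstring, if available
```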
@@ -897,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
 | `nO`        | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                                                    |

-### spacy.EmptyKB.v1 {id="EmptyKB"}
+### spacy.EmptyKB.v1 {id="EmptyKB.v1"}

 A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
-instance. This is the default when a new entity linker component is created.
+instance.

 | Name                   | Description                                                                          |
 | ---------------------- | ------------------------------------------------------------------------------------ |
 | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |

+### spacy.EmptyKB.v2 {id="EmptyKB"}
+
+A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
+instance. This is the default when a new entity linker component is created. It
+returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
+
 ### spacy.KBFromFile.v1 {id="KBFromFile"}

 A function that reads an existing `KnowledgeBase` from file.
@@ -922,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default
 `CandidateGenerator` uses the text of a mention to find its potential aliases in
 the `KnowledgeBase`. Note that this function is case-dependent.

+### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"}
+
+A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of
+[`Span`](/api/span) objects denoting named entities, and returns a list of
+plausible [`Candidate`](/api/kb/#candidate) objects per specified
+[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a
+mention to find its potential aliases in the `KnowledgeBase`. Note that this
+function is case-dependent.
+
 ## Coreference {id="coref-architectures",tag="experimental"}

 A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to
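The new `CandidateBatchGenerator.v1` entry describes a function mapping a KB plus an iterable of entity mentions to one candidate list per mention. A hedged sketch of a custom implementation with that shape; the function name is illustrative, and it assumes the `get_alias_candidates` lookup available on `InMemoryLookupKB` in spaCy 3.x:

```python
from typing import Iterable, List

from spacy.kb import Candidate, InMemoryLookupKB
from spacy.tokens import Span


def batch_candidates(
    kb: InMemoryLookupKB, mentions: Iterable[Span]
) -> List[List[Candidate]]:
    # Illustrative only: one candidate list per mention, looked up by the
    # mention's surface text, mirroring the case-dependent default.
    return [list(kb.get_alias_candidates(mention.text)) for mention in mentions]
```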
@@ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir]
 ### project pull {id="project-pull",tag="command"}

 Download all files or directories listed as `outputs` for commands, unless they
-are not already present locally. When searching for files in the remote, `pull`
+are already present locally. When searching for files in the remote, `pull`
 won't just look at the output path, but will also consider the **command
 string** and the **hashes of the dependencies**. For instance, let's say you've
 previously pushed a checkpoint to the remote, but now you've changed some
@@ -69,7 +69,7 @@ come directly from
 [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):

 | Symbol                                  | Description                                                                                                          |
-| --------- | -------------------------------------------------------------------------------------------------------------------- |
+| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
 | `A < B`                                 | `A` is the immediate dependent of `B`.                                                                               |
 | `A > B`                                 | `A` is the immediate head of `B`.                                                                                    |
 | `A << B`                                | `A` is the dependent in a chain to `B` following dep → head paths.                                              |
@@ -82,8 +82,12 @@ come directly from
 | `A $- B`                                | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`.                  |
 | `A $++ B`                               | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`.                                |
 | `A $-- B`                               | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`.                                 |
+| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_.          |
+| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_.           |
 | `A >++ B`                               | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_.                         |
 | `A >-- B`                               | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_.                          |
+| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_.          |
+| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_.           |
 | `A <++ B`                               | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_.                         |
 | `A <-- B`                               | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_.                          |
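The newly documented operator pairs are easiest to see in a `DependencyMatcher` pattern. A sketch using `>-` (left immediate child), assuming `en_core_web_sm` is installed:

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")  # assumes this pipeline is installed

# `>-`: the determiner must be the left immediate child of the noun.
pattern = [
    {"RIGHT_ID": "noun", "RIGHT_ATTRS": {"POS": "NOUN"}},
    {
        "LEFT_ID": "noun",
        "REL_OP": ">-",
        "RIGHT_ID": "det",
        "RIGHT_ATTRS": {"DEP": "det"},
    },
]
matcher = DependencyMatcher(nlp.vocab)
matcher.add("NOUN_WITH_DET", [pattern])

doc = nlp("She quickly founded a company.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])  # e.g. ['company', 'a']
```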
@@ -654,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer).

 ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"}

-Iterate over the base noun phrases in the document. Yields base noun-phrase
-`Span` objects, if the document has been syntactically parsed. A base noun
-phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
-nested within it – so no NP-level coordination, no prepositional phrases, and no
-relative clauses.
+Returns a tuple of the base noun phrases in the doc, if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it – so no NP-level coordination,
+no prepositional phrases, and no relative clauses.

 To customize the noun chunk iterator in a loaded pipeline, modify
 [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk`
@@ -676,12 +675,12 @@ implemented for the given language, a `NotImplementedError` is raised.
 > ```

 | Name        | Description                                  |
-| ---------- | ------------------------------------- |
-| **YIELDS** | Noun chunks in the document. ~~Span~~ |
+| ----------- | -------------------------------------------- |
+| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ |

 ## Doc.sents {id="sents",tag="property",model="sentences"}

-Iterate over the sentences in the document. Sentence spans have no label.
+Returns a tuple of the sentences in the document. Sentence spans have no label.

 This property is only available when
 [sentence boundaries](/usage/linguistic-features#sbd) have been set on the
@@ -698,8 +697,8 @@ will raise an error otherwise.
 > ```

 | Name        | Description                                |
-| ---------- | ----------------------------------- |
-| **YIELDS** | Sentences in the document. ~~Span~~ |
+| ----------- | ------------------------------------------ |
+| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ |

 ## Doc.has_vector {id="has_vector",tag="property",model="vectors"}
| 
						 | 
					@ -54,7 +54,7 @@ architectures and their arguments and hyperparameters.
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Setting                                             | Description                                                                                                                                                                                                                                                                                                      |
 | 
					| Setting                                             | Description                                                                                                                                                                                                                                                                                                      |
 | 
				
			||||||
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
					| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
				
			||||||
| `labels_discard`                                    | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                                   |
 | 
					| `labels_discard`                                    | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~                                                                                                                                                                                                                   |
 | 
				
			||||||
| `n_sents`                                           | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                                                |
 | 
					| `n_sents`                                           | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~                                                                                                                                                                                                                                |
 | 
				
			||||||
| `incl_prior`                                        | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                             |
 | 
					| `incl_prior`                                        | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~                                                                                                                                                                                                             |
 | 
				
			||||||
@@ -62,8 +62,9 @@ architectures and their arguments and hyperparameters.

 | `model`                | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
 | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
 | `use_gold_ents`        | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
-| `get_candidates`       | Function that retrieves plausible candidates per entity mention in a given `SpanGroup`. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator). ~~Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]~~ |
+| `get_candidates`       | Function that retrieves plausible candidates per entity mention in a given `Iterable[SpanGroup]`. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator). ~~Callable[[KnowledgeBase, Iterator[SpanGroup]], Iterator[Iterable[Iterable[Candidate]]]]~~ |
-| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ |
+| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
+| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
 | `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
 | `save_activations` <Tag variant="new">4.0</Tag> | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ |
 | `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted; otherwise, predictions with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
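For reference, a minimal sketch of how these settings can be passed when adding the component. The config values below are illustrative assumptions, not defaults taken from this diff:

```python
import spacy

nlp = spacy.blank("en")
# Illustrative values only; see the table above for the actual defaults.
entity_linker = nlp.add_pipe(
    "entity_linker",
    config={
        "labels_discard": ["CARDINAL"],  # always predict NIL for CARDINAL mentions
        "n_sents": 1,                    # use one neighbouring sentence as extra context
        "incl_prior": True,              # include prior probabilities from the KB
        "threshold": None,               # accept all predictions
    },
)
```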
@@ -276,16 +276,15 @@ The named entities that fall completely within the span. Returns a tuple of

 > ```

 | Name        | Description |
 | ----------- | ----------- |
-| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
+| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ |
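A small usage sketch of `Span.ents`, assuming the `en_core_web_sm` pipeline is installed; the model name and sample text are assumptions for illustration:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is opening its first big office in San Francisco.")
span = doc[0:5]
# Only entities that fall completely within the span are returned
for ent in span.ents:
    print(ent.text, ent.label_)
```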
 ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"}

-Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
-objects, if the document has been syntactically parsed. A base noun phrase, or
-"NP chunk", is a noun phrase that does not permit other NPs to be nested within
-it – so no NP-level coordination, no prepositional phrases, and no relative
-clauses.
+Returns a tuple of the base noun phrases in the span if the document has been
+syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that
+does not permit other NPs to be nested within it – so no NP-level coordination,
+no prepositional phrases, and no relative clauses.

 If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
 has not been implemented for the given language, a `NotImplementedError` is
@@ -302,8 +301,8 @@ raised.

 > ```

 | Name        | Description |
 | ----------- | ----------- |
-| **YIELDS**  | Noun chunks in the span. ~~Span~~ |
+| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ |
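A brief sketch of the property in use, again assuming `en_core_web_sm`; the sample sentence is an assumption:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("A phrase with another phrase occurs.")
span = doc[0:6]
# Requires a syntactic parse; raises NotImplementedError for
# languages without a noun_chunk syntax iterator.
print([chunk.text for chunk in span.noun_chunks])
```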
 ## Span.as_doc {id="as_doc",tag="method"}
@@ -525,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)]

 ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"}

-Returns a generator over the sentences the span belongs to. This property is
-only available when [sentence boundaries](/usage/linguistic-features#sbd) have
-been set on the document by the `parser`, `senter`, `sentencizer` or some custom
-function. It will raise an error otherwise.
+Returns a tuple of the sentences the span belongs to. This property is only
+available when [sentence boundaries](/usage/linguistic-features#sbd) have been
+set on the document by the `parser`, `senter`, `sentencizer` or some custom
+function. It will raise an error otherwise.

 If the span happens to cross sentence boundaries, all sentences the span
@@ -542,8 +541,8 @@ overlaps with will be returned.

 > ```

 | Name        | Description |
 | ----------- | ----------- |
-| **RETURNS** | A generator yielding sentences this `Span` is a part of. ~~Iterable[Span]~~ |
+| **RETURNS** | A tuple of sentences this `Span` is a part of. ~~Tuple[Span]~~ |
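A short sketch, assuming `en_core_web_sm`; the span below is chosen so that it crosses a sentence boundary:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Give it back! He pleaded.")
span = doc[2:5]  # "back! He" crosses a sentence boundary
# All sentences the span overlaps with are returned
print([sent.text for sent in span.sents])
```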

 ## Attributes {id="attributes"}
@@ -355,14 +355,14 @@ If a setting is not present in the options, the default value will be used.

 > ```

 | Name               | Description |
 | ------------------ | ----------- |
 | `fine_grained`     | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
 | `add_lemma`        | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
 | `collapse_punct`   | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs from attaching to punctuation. Defaults to `True`. ~~bool~~ |
 | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
 | `compact`          | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
-| `color`            | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
+| `color`            | Text color. Can be provided in any CSS-legal format as a string, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
-| `bg`               | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
+| `bg`               | Background color. Can be provided in any CSS-legal format as a string, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
 | `font`             | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
 | `offset_x`         | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
 | `arrow_stroke`     | Width of arrow path in px. Defaults to `2`. ~~int~~ |
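As a sketch of how these options are passed; the option values here are arbitrary choices for illustration:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
options = {
    "compact": True,
    "color": "hsl(120, 100%, 50%)",  # any CSS-legal color string works
    "bg": "#000000",
}
svg = displacy.render(doc, style="dep", options=options)
```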
@@ -1101,7 +1101,7 @@ come directly from
 [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):

 | Symbol   | Description |
 | -------- | ----------- |
 | `A < B`  | `A` is the immediate dependent of `B`. |
 | `A > B`  | `A` is the immediate head of `B`. |
 | `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
@@ -1114,6 +1114,14 @@ come directly from
 | `A $- B`  | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
 | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
 | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
+| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
+| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
+| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
+| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
+| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
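A runnable sketch using one of the newly documented operators; `en_core_web_sm`, the match name and the sample sentence are assumptions for illustration:

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
    # ">++": the object is a child of the verb and sits to its right
    {"LEFT_ID": "verb", "REL_OP": ">++", "RIGHT_ID": "object",
     "RIGHT_ATTRS": {"DEP": "dobj"}},
]
matcher.add("VERB_OBJECT", [pattern])
doc = nlp("Smith founded a healthcare company.")
for match_id, token_ids in matcher(doc):
    print([doc[i].text for i in token_ids])
```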
 ### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
@@ -1445,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline")

 The saved pipeline now includes the `"entity_ruler"` in its
 [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a
-file `entityruler.jsonl` with the patterns. When you load the pipeline back in,
-all pipeline components will be restored and deserialized – including the entity
-ruler. This lets you ship powerful pipeline packages with binary weights _and_
-rules included!
+file `patterns.jsonl` with the patterns. When you load the pipeline back in, all
+pipeline components will be restored and deserialized – including the entity
+ruler. This lets you ship powerful pipeline packages with binary weights _and_
+rules included!
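A minimal save-and-reload sketch; the path and the single pattern are placeholder assumptions:

```python
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
nlp.to_disk("/path/to/pipeline")  # placeholder path

# Reloading restores the ruler together with its saved patterns
nlp2 = spacy.load("/path/to/pipeline")
print(nlp2.pipe_names)  # ['entity_ruler']
```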
@@ -59,10 +59,10 @@ arcs.

 </Infobox>

 | Argument  | Description |
 | --------- | ----------- |
 | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
-| `color`   | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
+| `color`   | Text color. Can be provided in any CSS-legal format as a string, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ |
-| `bg`      | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
+| `bg`      | Background color. Can be provided in any CSS-legal format as a string, e.g. `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ |
 | `font`    | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
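A short sketch of passing these settings, here via `displacy.serve`; the port and sample text are illustrative assumptions:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
# "green" is accepted because any CSS-legal color string is allowed
displacy.serve(doc, style="dep",
               options={"compact": True, "bg": "green"}, port=5000)
```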
 For a list of all available options, see the