diff --git a/.github/ISSUE_TEMPLATE/01_bugs.md b/.github/ISSUE_TEMPLATE/01_bugs.md
index 255a5241e..f0d0ba912 100644
--- a/.github/ISSUE_TEMPLATE/01_bugs.md
+++ b/.github/ISSUE_TEMPLATE/01_bugs.md
@@ -10,7 +10,7 @@ about: Use this template if you came across a bug or unexpected behaviour differ
## Your Environment
-
+
* Operating System:
* Python Version Used:
* spaCy Version Used:
diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index c7722391f..cc0247b3a 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -10,6 +10,7 @@ steps:
inputs:
versionSpec: ${{ parameters.python_version }}
architecture: ${{ parameters.architecture }}
+ allowUnstable: true
- bash: |
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
@@ -27,7 +28,7 @@ steps:
- script: python -m mypy spacy
displayName: 'Run mypy'
- condition: ne(variables['python_version'], '3.10')
+ condition: ne(variables['python_version'], '3.6')
- task: DeleteFiles@1
inputs:
diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml
index 8d0282650..3ad4cf408 100644
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@@ -12,10 +12,10 @@ jobs:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
with:
ref: ${{ github.head_ref }}
- - uses: actions/setup-python@v2
+ - uses: actions/setup-python@v3
- run: pip install black
- name: Auto-format code if needed
run: black spacy
@@ -23,10 +23,11 @@ jobs:
# code and makes GitHub think the action failed
- name: Check for modified files
id: git-check
- run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+ run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
+
- name: Create Pull Request
if: steps.git-check.outputs.modified == 'true'
- uses: peter-evans/create-pull-request@v3
+ uses: peter-evans/create-pull-request@v4
with:
title: Auto-format code with black
labels: meta
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b959262e3..df59697b1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ repos:
language_version: python3.7
additional_dependencies: ['click==8.0.4']
- repo: https://gitlab.com/pycqa/flake8
- rev: 3.9.2
+ rev: 5.0.4
hooks:
- id: flake8
args:
diff --git a/README.md b/README.md
index d9ef83e01..abfc3da67 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ be used in real products.
spaCy comes with
[pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
state-of-the-art speed and **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the MIT license.
-💫 **Version 3.4.0 out now!**
+💫 **Version 3.4 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -79,7 +79,7 @@ more people can benefit from it.
## Features
-- Support for **60+ languages**
+- Support for **70+ languages**
- **Trained pipelines** for different languages and tasks
- Multi-task learning with pretrained **transformers** like BERT
- Support for pretrained **word vectors** and embeddings
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index f475b7fdd..eea07cb7a 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -31,7 +31,7 @@ jobs:
inputs:
versionSpec: "3.7"
- script: |
- pip install flake8==3.9.2
+ pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
displayName: "flake8"
@@ -76,15 +76,24 @@ jobs:
# Python39Mac:
# imageName: "macos-latest"
# python.version: "3.9"
- Python310Linux:
- imageName: "ubuntu-latest"
- python.version: "3.10"
+ # Python310Linux:
+ # imageName: "ubuntu-latest"
+ # python.version: "3.10"
Python310Windows:
imageName: "windows-latest"
python.version: "3.10"
- Python310Mac:
- imageName: "macos-latest"
- python.version: "3.10"
+ # Python310Mac:
+ # imageName: "macos-latest"
+ # python.version: "3.10"
+ Python311Linux:
+ imageName: 'ubuntu-latest'
+ python.version: '3.11.0-rc.2'
+ Python311Windows:
+ imageName: 'windows-latest'
+ python.version: '3.11.0-rc.2'
+ Python311Mac:
+ imageName: 'macos-latest'
+ python.version: '3.11.0-rc.2'
maxParallel: 4
pool:
vmImage: $(imageName)
diff --git a/requirements.txt b/requirements.txt
index 3e8501b2f..9d6bbb2c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ pathy>=0.3.5
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
@@ -28,11 +28,12 @@ cython>=0.25,<3.0
pytest>=5.2.0,!=7.1.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
-flake8>=3.8.0,<3.10.0
+flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
-mypy>=0.910,<0.970; platform_machine!='aarch64'
+mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
+types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
black>=22.0,<23.0
diff --git a/setup.cfg b/setup.cfg
index 2dc5e7042..c2653feba 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -56,7 +56,7 @@ install_requires =
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
- pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+ pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
# Official Python utilities
setuptools
diff --git a/setup.py b/setup.py
index c4138aa93..243554c7a 100755
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,9 @@ MOD_NAMES = [
"spacy.lexeme",
"spacy.vocab",
"spacy.attrs",
- "spacy.kb",
+ "spacy.kb.candidate",
+ "spacy.kb.kb",
+ "spacy.kb.kb_in_memory",
"spacy.ml.parser_model",
"spacy.morphology",
"spacy.pipeline.dep_parser",
diff --git a/spacy/__init__.py b/spacy/__init__.py
index d60f46b96..c3568bc5c 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -31,9 +31,9 @@ def load(
name: Union[str, Path],
*,
vocab: Union[Vocab, bool] = True,
- disable: Union[str, Iterable[str]] = util.SimpleFrozenList(),
- enable: Union[str, Iterable[str]] = util.SimpleFrozenList(),
- exclude: Union[str, Iterable[str]] = util.SimpleFrozenList(),
+ disable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
+ enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
+ exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language:
"""Load a spaCy model from an installed package or a local path.
diff --git a/spacy/about.py b/spacy/about.py
index 843c15aba..ce86e6294 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.4.1"
+__version__ = "3.4.2"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 3506bdde7..ddd6c10ce 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -596,3 +596,12 @@ def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
# It's good to sort these, in case the ordering messes up cache.
locs.sort()
return locs
+
+
+def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
+ """Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
+ as happens with `round(number, ndigits)`"""
+ if isinstance(number, float):
+ return f"{number:.{ndigits}f}"
+ else:
+ return str(number)
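
A standalone sketch of the difference this helper addresses (illustrative values): `round()` loses a trailing zero when printed, while the f-string keeps exactly `ndigits` digits.

def format_number(number, ndigits: int = 2) -> str:
    # same logic as _format_number above, copied here so the snippet runs on its own
    if isinstance(number, float):
        return f"{number:.{ndigits}f}"
    return str(number)

print(round(1.497, 2))        # 1.5   -- trailing zero dropped
print(format_number(1.497))   # 1.50  -- trailing zero kept
print(format_number(7))       # 7     -- ints pass through unchanged
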
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index bd05471b1..963d5b926 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -9,7 +9,7 @@ import typer
import math
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
+from ._util import import_code, debug_cli, _format_number
from ..training import Example, remove_bilu_prefix
from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
@@ -989,7 +989,8 @@ def _get_kl_divergence(p: Counter, q: Counter) -> float:
def _format_span_row(span_data: List[Dict], labels: List[str]) -> List[Any]:
"""Compile into one list for easier reporting"""
d = {
- label: [label] + list(round(d[label], 2) for d in span_data) for label in labels
+ label: [label] + list(_format_number(d[label]) for d in span_data)
+ for label in labels
}
return list(d.values())
@@ -1004,6 +1005,10 @@ def _get_span_characteristics(
label: _gmean(l)
for label, l in compiled_gold["spans_length"][spans_key].items()
}
+ spans_per_type = {
+ label: len(spans)
+ for label, spans in compiled_gold["spans_per_type"][spans_key].items()
+ }
min_lengths = [min(l) for l in compiled_gold["spans_length"][spans_key].values()]
max_lengths = [max(l) for l in compiled_gold["spans_length"][spans_key].values()]
@@ -1031,6 +1036,7 @@ def _get_span_characteristics(
return {
"sd": span_distinctiveness,
"bd": sb_distinctiveness,
+ "spans_per_type": spans_per_type,
"lengths": span_length,
"min_length": min(min_lengths),
"max_length": max(max_lengths),
@@ -1045,12 +1051,15 @@ def _get_span_characteristics(
def _print_span_characteristics(span_characteristics: Dict[str, Any]):
"""Print all span characteristics into a table"""
- headers = ("Span Type", "Length", "SD", "BD")
+ headers = ("Span Type", "Length", "SD", "BD", "N")
+ # Wasabi has this at 30 by default, but we might have some long labels
+ max_col = max(30, max(len(label) for label in span_characteristics["labels"]))
# Prepare table data with all span characteristics
table_data = [
span_characteristics["lengths"],
span_characteristics["sd"],
span_characteristics["bd"],
+ span_characteristics["spans_per_type"],
]
table = _format_span_row(
span_data=table_data, labels=span_characteristics["labels"]
@@ -1061,8 +1070,18 @@ def _print_span_characteristics(span_characteristics: Dict[str, Any]):
span_characteristics["avg_sd"],
span_characteristics["avg_bd"],
]
- footer = ["Wgt. Average"] + [str(round(f, 2)) for f in footer_data]
- msg.table(table, footer=footer, header=headers, divider=True)
+
+ footer = (
+ ["Wgt. Average"] + ["{:.2f}".format(round(f, 2)) for f in footer_data] + ["-"]
+ )
+ msg.table(
+ table,
+ footer=footer,
+ header=headers,
+ divider=True,
+ aligns=["l"] + ["r"] * (len(footer_data) + 1),
+ max_col=max_col,
+ )
def _get_spans_length_freq_dist(
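
For reference, a standalone sketch of how the reworked table call renders; the numbers are made up, and `aligns`/`max_col` are the wasabi options used above.

from wasabi import msg

headers = ("Span Type", "Length", "SD", "BD", "N")
table = [
    ["PERSON", "1.82", "0.43", "0.61", "512"],
    ["A_VERY_LONG_CUSTOM_SPAN_LABEL", "2.10", "0.39", "0.58", "230"],
]
footer = ["Wgt. Average", "1.91", "0.42", "0.60", "-"]
msg.table(
    table,
    footer=footer,
    header=headers,
    divider=True,
    aligns=["l"] + ["r"] * 4,
    max_col=max(30, max(len(row[0]) for row in table)),
)
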
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index b8c8397b6..324c5d1bb 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -299,8 +299,8 @@ def get_meta(
}
nlp = util.load_model_from_path(Path(model_path))
meta.update(nlp.meta)
- meta.update(existing_meta)
meta["spacy_version"] = util.get_minor_version_range(about.__version__)
+ meta.update(existing_meta)
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),
diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
index 83dc5efbf..a15353855 100644
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -25,6 +25,7 @@ def project_update_dvc_cli(
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+ quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
@@ -36,7 +37,7 @@ def project_update_dvc_cli(
DOCS: https://spacy.io/api/cli#project-dvc
"""
- project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
+ project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
def project_update_dvc(
@@ -44,6 +45,7 @@ def project_update_dvc(
workflow: Optional[str] = None,
*,
verbose: bool = False,
+ quiet: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
@@ -54,11 +56,12 @@ def project_update_dvc(
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
+ quiet (bool): Print less info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
- project_dir, config, workflow, verbose=verbose, force=force
+ project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
@@ -72,7 +75,7 @@ def update_dvc_config(
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
- silent: bool = False,
+ quiet: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
@@ -83,7 +86,7 @@ def update_dvc_config(
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC).
- silent (bool): Don't output anything (via DVC).
+ quiet (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
@@ -105,6 +108,14 @@ def update_dvc_config(
dvc_config_path.unlink()
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+
+ # some flags that apply to every command
+ flags = []
+ if verbose:
+ flags.append("--verbose")
+ if quiet:
+ flags.append("--quiet")
+
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
@@ -118,14 +129,26 @@ def update_dvc_config(
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
- dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
+
+ dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
if command.get("no_skip"):
dvc_cmd.append("--always-changed")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
+
+ if not dvc_commands:
+ # If we don't check for this, then there will be an error when reading the
+ # config, since DVC wouldn't create it.
+ msg.fail(
+ "No usable commands for DVC found. This can happen if none of your "
+ "commands have dependencies or outputs.",
+ exits=1,
+ )
+
with working_dir(path):
- dvc_flags = {"--verbose": verbose, "--quiet": silent}
- run_dvc_commands(dvc_commands, flags=dvc_flags)
+ for c in dvc_commands:
+ dvc_command = "dvc " + c
+ run_command(dvc_command)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
@@ -133,26 +156,6 @@ def update_dvc_config(
return True
-def run_dvc_commands(
- commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
-) -> None:
- """Run a sequence of DVC commands in a subprocess, in order.
-
- commands (List[str]): The string commands without the leading "dvc".
- flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
- easier to pass flags like --quiet that depend on a variable or
- command-line setting while avoiding lots of nested conditionals.
- """
- for c in commands:
- command = split_command(c)
- dvc_command = ["dvc", *command]
- # Add the flags if they are set to True
- for flag, is_active in flags.items():
- if is_active:
- dvc_command.append(flag)
- run_command(dvc_command)
-
-
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index d42d95465..ebab7471e 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -1,5 +1,8 @@
-from typing import Optional, List, Dict, Sequence, Any, Iterable
+from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
+import os.path
from pathlib import Path
+
+import pkg_resources
from wasabi import msg
from wasabi.util import locale_escape
import sys
@@ -71,6 +74,12 @@ def project_run(
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
workflows = config.get("workflows", {})
validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
+
+ req_path = project_dir / "requirements.txt"
+ if config.get("check_requirements", True) and os.path.exists(req_path):
+ with req_path.open() as requirements_file:
+ _check_requirements([req.replace("\n", "") for req in requirements_file])
+
if subcommand in workflows:
msg.info(f"Running workflow '{subcommand}'")
for cmd in workflows[subcommand]:
@@ -310,3 +319,32 @@ def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional
md5 = get_checksum(file_path) if file_path.exists() else None
data.append({"path": path, "md5": md5})
return data
+
+
+def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
+ """Checks whether requirements are installed and free of version conflicts.
+ requirements (List[str]): List of requirements.
+ RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported and (2) any version
+ conflicts exist.
+ """
+
+ failed_pkgs_msgs: List[str] = []
+ conflicting_pkgs_msgs: List[str] = []
+
+ for req in requirements:
+ try:
+ pkg_resources.require(req)
+ except pkg_resources.DistributionNotFound as dnf:
+ failed_pkgs_msgs.append(dnf.report())
+ except pkg_resources.VersionConflict as vc:
+ conflicting_pkgs_msgs.append(vc.report())
+
+ if len(failed_pkgs_msgs) or len(conflicting_pkgs_msgs):
+ msg.warn(
+ title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
+ "correctly and you installed all requirements specified in your project's requirements.txt: "
+ )
+ for pgk_msg in failed_pkgs_msgs + conflicting_pkgs_msgs:
+ msg.text(pgk_msg)
+
+ return len(failed_pkgs_msgs) > 0, len(conflicting_pkgs_msgs) > 0
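
A minimal standalone sketch of the `pkg_resources` checks used above (the requirement strings are only examples); setting `check_requirements: false` in project.yml skips the check via the `config.get("check_requirements", True)` lookup.

import pkg_resources

for req in ["spacy>=3.4.0", "some-package-that-is-not-installed>=1.0"]:
    try:
        pkg_resources.require(req)
        print(f"ok: {req}")
    except pkg_resources.DistributionNotFound as dnf:
        print(f"missing: {dnf.report()}")
    except pkg_resources.VersionConflict as vc:
        print(f"conflict: {vc.report()}")
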
diff --git a/spacy/errors.py b/spacy/errors.py
index f55b378e9..e0628819d 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -212,6 +212,8 @@ class Warnings(metaclass=ErrorsWithCodes):
W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
"is a Cython extension type.")
+ W123 = ("Argument {arg} with value {arg_value} is used instead of {config_value} as specified in the config. Be "
+ "aware that this might affect other components in your pipeline.")
class Errors(metaclass=ErrorsWithCodes):
@@ -538,6 +540,8 @@ class Errors(metaclass=ErrorsWithCodes):
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
E200 = ("Can't set {attr} from Span.")
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
+ E203 = ("If the {name} embedding layer is not updated "
+ "during training, make sure to include it in 'annotating components'")
# New errors added in v3.x
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
@@ -709,9 +713,9 @@ class Errors(metaclass=ErrorsWithCodes):
"`nlp.enable_pipe` instead.")
E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
"property or default function argument?")
- E928 = ("A KnowledgeBase can only be serialized to/from from a directory, "
+ E928 = ("An InMemoryLookupKB can only be serialized to/from from a directory, "
"but the provided argument {loc} points to a file.")
- E929 = ("Couldn't read KnowledgeBase from {loc}. The path does not seem to exist.")
+ E929 = ("Couldn't read InMemoryLookupKB from {loc}. The path does not seem to exist.")
E930 = ("Received invalid get_examples callback in `{method}`. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
@@ -937,10 +941,17 @@ class Errors(metaclass=ErrorsWithCodes):
E1040 = ("Doc.from_json requires all tokens to have the same attributes. "
"Some tokens do not contain annotation for: {partial_attrs}")
E1041 = ("Expected a string, Doc, or bytes as input, but got: {type}")
- E1042 = ("Function was called with `{arg1}`={arg1_values} and "
- "`{arg2}`={arg2_values} but these arguments are conflicting.")
+ E1042 = ("`enable={enable}` and `disable={disable}` are inconsistent with each other.\nIf you only passed "
+ "one of `enable` or `disable`, the other argument is specified in your pipeline's configuration.\nIn that "
+ "case pass an empty list for the previously not specified argument to avoid this error.")
E1043 = ("Expected None or a value in range [{range_start}, {range_end}] for entity linker threshold, but got "
"{value}.")
+ E1044 = ("Expected `candidates_batch_size` to be >= 1, but got: {value}")
+ E1045 = ("Encountered {parent} subclass without `{parent}.{method}` "
+ "method in '{name}'. If you want to use this method, make "
+ "sure it's overwritten on the subclass.")
+ E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
+ "knowledge base, use `InMemoryLookupKB`.")
# Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
new file mode 100644
index 000000000..1d70a9b34
--- /dev/null
+++ b/spacy/kb/__init__.py
@@ -0,0 +1,3 @@
+from .kb import KnowledgeBase
+from .kb_in_memory import InMemoryLookupKB
+from .candidate import Candidate, get_candidates, get_candidates_batch
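
With this package `__init__`, the old flat imports keep resolving while the new in-memory implementation is exposed alongside them; a quick sanity sketch:

# The former `from spacy.kb import KnowledgeBase, Candidate, get_candidates` still works,
# and the refactored lookup KB is importable from the same place.
from spacy.kb import KnowledgeBase, InMemoryLookupKB, Candidate, get_candidates
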
diff --git a/spacy/kb/candidate.pxd b/spacy/kb/candidate.pxd
new file mode 100644
index 000000000..942ce9dd0
--- /dev/null
+++ b/spacy/kb/candidate.pxd
@@ -0,0 +1,12 @@
+from .kb cimport KnowledgeBase
+from libcpp.vector cimport vector
+from ..typedefs cimport hash_t
+
+# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
+cdef class Candidate:
+ cdef readonly KnowledgeBase kb
+ cdef hash_t entity_hash
+ cdef float entity_freq
+ cdef vector[float] entity_vector
+ cdef hash_t alias_hash
+ cdef float prior_prob
diff --git a/spacy/kb/candidate.pyx b/spacy/kb/candidate.pyx
new file mode 100644
index 000000000..c89efeb03
--- /dev/null
+++ b/spacy/kb/candidate.pyx
@@ -0,0 +1,74 @@
+# cython: infer_types=True, profile=True
+
+from typing import Iterable
+from .kb cimport KnowledgeBase
+from ..tokens import Span
+
+cdef class Candidate:
+ """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
+ to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
+ algorithm which will disambiguate the various candidates to the correct one.
+ Each candidate (alias, entity) pair is assigned a certain prior probability.
+
+ DOCS: https://spacy.io/api/kb/#candidate-init
+ """
+
+ def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
+ self.kb = kb
+ self.entity_hash = entity_hash
+ self.entity_freq = entity_freq
+ self.entity_vector = entity_vector
+ self.alias_hash = alias_hash
+ self.prior_prob = prior_prob
+
+ @property
+ def entity(self) -> int:
+ """RETURNS (uint64): hash of the entity's KB ID/name"""
+ return self.entity_hash
+
+ @property
+ def entity_(self) -> str:
+ """RETURNS (str): ID/name of this entity in the KB"""
+ return self.kb.vocab.strings[self.entity_hash]
+
+ @property
+ def alias(self) -> int:
+ """RETURNS (uint64): hash of the alias"""
+ return self.alias_hash
+
+ @property
+ def alias_(self) -> str:
+ """RETURNS (str): ID of the original alias"""
+ return self.kb.vocab.strings[self.alias_hash]
+
+ @property
+ def entity_freq(self) -> float:
+ return self.entity_freq
+
+ @property
+ def entity_vector(self) -> Iterable[float]:
+ return self.entity_vector
+
+ @property
+ def prior_prob(self) -> float:
+ return self.prior_prob
+
+
+def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
+ """
+ Return candidate entities for a given mention by fetching appropriate entries from the index.
+ kb (KnowledgeBase): Knowledge base to query.
+ mention (Span): Entity mention for which to identify candidates.
+ RETURNS (Iterable[Candidate]): Identified candidates.
+ """
+ return kb.get_candidates(mention)
+
+
+def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+ """
+ Return candidate entities for the given mentions by fetching appropriate entries from the index.
+ kb (KnowledgeBase): Knowledge base to query.
+ mentions (Iterable[Span]): Entity mentions for which to identify candidates.
+ RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
+ """
+ return kb.get_candidates_batch(mentions)
diff --git a/spacy/kb/kb.pxd b/spacy/kb/kb.pxd
new file mode 100644
index 000000000..1adeef8ae
--- /dev/null
+++ b/spacy/kb/kb.pxd
@@ -0,0 +1,10 @@
+"""Knowledge-base for entity or concept linking."""
+
+from cymem.cymem cimport Pool
+from libc.stdint cimport int64_t
+from ..vocab cimport Vocab
+
+cdef class KnowledgeBase:
+ cdef Pool mem
+ cdef readonly Vocab vocab
+ cdef readonly int64_t entity_vector_length
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
new file mode 100644
index 000000000..ce4bc0138
--- /dev/null
+++ b/spacy/kb/kb.pyx
@@ -0,0 +1,108 @@
+# cython: infer_types=True, profile=True
+
+from pathlib import Path
+from typing import Iterable, Tuple, Union
+from cymem.cymem cimport Pool
+
+from .candidate import Candidate
+from ..tokens import Span
+from ..util import SimpleFrozenList
+from ..errors import Errors
+
+
+cdef class KnowledgeBase:
+ """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
+ to support entity linking of named entities to real-world concepts.
+ This is an abstract class and requires its operations to be implemented.
+
+ DOCS: https://spacy.io/api/kb
+ """
+
+ def __init__(self, vocab: Vocab, entity_vector_length: int):
+ """Create a KnowledgeBase."""
+ # Make sure abstract KB is not instantiated.
+ if self.__class__ == KnowledgeBase:
+ raise TypeError(
+ Errors.E1046.format(cls_name=self.__class__.__name__)
+ )
+
+ self.vocab = vocab
+ self.entity_vector_length = entity_vector_length
+ self.mem = Pool()
+
+ def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+ """
+ Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
+ and the prior probability of that alias resolving to that entity.
+ If no candidate is found for a given text, an empty list is returned.
+ mentions (Iterable[Span]): Mentions for which to get candidates.
+ RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
+ """
+ return [self.get_candidates(span) for span in mentions]
+
+ def get_candidates(self, mention: Span) -> Iterable[Candidate]:
+ """
+ Return candidate entities for specified text. Each candidate defines the entity, the original alias,
+ and the prior probability of that alias resolving to that entity.
+ If no candidate is found for a given text, an empty list is returned.
+ mention (Span): Mention for which to get candidates.
+ RETURNS (Iterable[Candidate]): Identified candidates.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
+ )
+
+ def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
+ """
+ Return vectors for entities.
+ entities (Iterable[str]): Entity names/IDs.
+ RETURNS (Iterable[Iterable[float]]): Vectors for specified entities.
+ """
+ return [self.get_vector(entity) for entity in entities]
+
+ def get_vector(self, str entity) -> Iterable[float]:
+ """
+ Return vector for entity.
+ entity (str): Entity name/ID.
+ RETURNS (Iterable[float]): Vector for specified entity.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
+ )
+
+ def to_bytes(self, **kwargs) -> bytes:
+ """Serialize the current state to a binary string.
+ RETURNS (bytes): Current state as binary string.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
+ )
+
+ def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
+ """Load state from a binary string.
+ bytes_data (bytes): KB state.
+ exclude (Tuple[str]): Properties to exclude when restoring KB.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
+ )
+
+ def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+ """
+ Write KnowledgeBase content to disk.
+ path (Union[str, Path]): Target file path.
+ exclude (Iterable[str]): List of components to exclude.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
+ )
+
+ def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+ """
+ Load KnowledgeBase content from disk.
+ path (Union[str, Path]): Target file path.
+ exclude (Iterable[str]): List of components to exclude.
+ """
+ raise NotImplementedError(
+ Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
+ )
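
A minimal sketch of what the abstract base enables: a custom subclass that only overrides candidate lookup (illustrative only; a production subclass would also implement `get_vector` and the serialization hooks):

from typing import Iterable
from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Span


class StaticKB(KnowledgeBase):
    """Toy KB that never returns candidates; a real subclass would query an index or service."""

    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        return []
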
diff --git a/spacy/kb.pxd b/spacy/kb/kb_in_memory.pxd
similarity index 92%
rename from spacy/kb.pxd
rename to spacy/kb/kb_in_memory.pxd
index a823dbe1e..825a6bde9 100644
--- a/spacy/kb.pxd
+++ b/spacy/kb/kb_in_memory.pxd
@@ -1,14 +1,12 @@
"""Knowledge-base for entity or concept linking."""
-from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE
-from .vocab cimport Vocab
-from .typedefs cimport hash_t
-from .structs cimport KBEntryC, AliasC
-
+from ..typedefs cimport hash_t
+from ..structs cimport KBEntryC, AliasC
+from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec
ctypedef vector[AliasC] alias_vec
@@ -16,21 +14,7 @@ ctypedef vector[float] float_vec
ctypedef vector[float_vec] float_matrix
-# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
-cdef class Candidate:
- cdef readonly KnowledgeBase kb
- cdef hash_t entity_hash
- cdef float entity_freq
- cdef vector[float] entity_vector
- cdef hash_t alias_hash
- cdef float prior_prob
-
-
-cdef class KnowledgeBase:
- cdef Pool mem
- cdef readonly Vocab vocab
- cdef int64_t entity_vector_length
-
+cdef class InMemoryLookupKB(KnowledgeBase):
# This maps 64bit keys (hash of unique entity string)
# to 64bit values (position of the _KBEntryC struct in the _entries vector).
# The PreshMap is pretty space efficient, as it uses open addressing. So
diff --git a/spacy/kb.pyx b/spacy/kb/kb_in_memory.pyx
similarity index 90%
rename from spacy/kb.pyx
rename to spacy/kb/kb_in_memory.pyx
index ae1983a8d..485e52c2f 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -1,8 +1,7 @@
# cython: infer_types=True, profile=True
-from typing import Iterator, Iterable, Callable, Dict, Any
+from typing import Iterable, Callable, Dict, Any, Union
import srsly
-from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
@@ -12,85 +11,28 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings
-from .typedefs cimport hash_t
-from .errors import Errors, Warnings
-from . import util
-from .util import SimpleFrozenList, ensure_path
-
-cdef class Candidate:
- """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
- to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
- algorithm which will disambiguate the various candidates to the correct one.
- Each candidate (alias, entity) pair is assigned to a certain prior probability.
-
- DOCS: https://spacy.io/api/kb/#candidate_init
- """
-
- def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
- self.kb = kb
- self.entity_hash = entity_hash
- self.entity_freq = entity_freq
- self.entity_vector = entity_vector
- self.alias_hash = alias_hash
- self.prior_prob = prior_prob
-
- @property
- def entity(self):
- """RETURNS (uint64): hash of the entity's KB ID/name"""
- return self.entity_hash
-
- @property
- def entity_(self):
- """RETURNS (str): ID/name of this entity in the KB"""
- return self.kb.vocab.strings[self.entity_hash]
-
- @property
- def alias(self):
- """RETURNS (uint64): hash of the alias"""
- return self.alias_hash
-
- @property
- def alias_(self):
- """RETURNS (str): ID of the original alias"""
- return self.kb.vocab.strings[self.alias_hash]
-
- @property
- def entity_freq(self):
- return self.entity_freq
-
- @property
- def entity_vector(self):
- return self.entity_vector
-
- @property
- def prior_prob(self):
- return self.prior_prob
+from ..tokens import Span
+from ..typedefs cimport hash_t
+from ..errors import Errors, Warnings
+from .. import util
+from ..util import SimpleFrozenList, ensure_path
+from ..vocab cimport Vocab
+from .kb cimport KnowledgeBase
+from .candidate import Candidate as Candidate
-def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]:
- """
- Return candidate entities for a given span by using the text of the span as the alias
- and fetching appropriate entries from the index.
- This particular function is optimized to work with the built-in KB functionality,
- but any other custom candidate generation method can be used in combination with the KB as well.
- """
- return kb.get_alias_candidates(span.text)
-
-
-cdef class KnowledgeBase:
- """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
+cdef class InMemoryLookupKB(KnowledgeBase):
+ """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
- DOCS: https://spacy.io/api/kb
+ DOCS: https://spacy.io/api/kb_in_memory
"""
def __init__(self, Vocab vocab, entity_vector_length):
- """Create a KnowledgeBase."""
- self.mem = Pool()
- self.entity_vector_length = entity_vector_length
+ """Create an InMemoryLookupKB."""
+ super().__init__(vocab, entity_vector_length)
self._entry_index = PreshMap()
self._alias_index = PreshMap()
- self.vocab = vocab
self._create_empty_vectors(dummy_hash=self.vocab.strings[""])
def _initialize_entities(self, int64_t nr_entities):
@@ -104,11 +46,6 @@ cdef class KnowledgeBase:
self._alias_index = PreshMap(nr_aliases + 1)
self._aliases_table = alias_vec(nr_aliases + 1)
- @property
- def entity_vector_length(self):
- """RETURNS (uint64): length of the entity vectors"""
- return self.entity_vector_length
-
def __len__(self):
return self.get_size_entities()
@@ -286,7 +223,10 @@ cdef class KnowledgeBase:
alias_entry.probs = probs
self._aliases_table[alias_index] = alias_entry
- def get_alias_candidates(self, str alias) -> Iterator[Candidate]:
+ def get_candidates(self, mention: Span) -> Iterable[Candidate]:
+ return self.get_alias_candidates(mention.text) # type: ignore
+
+ def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
"""
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
and the prior probability of that alias resolving to that entity.
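
End to end, the renamed class is used much like the old `KnowledgeBase`, with `get_candidates` now taking a `Span`; a small sketch (entity ID, frequency and vector are made up):

from spacy.kb import InMemoryLookupKB
from spacy.lang.en import English

nlp = English()
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[1.0])

doc = nlp("Douglas wrote books.")
print([c.entity_ for c in kb.get_candidates(doc[0:1])])  # expected: ['Q42']
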
diff --git a/spacy/lang/ca/lemmatizer.py b/spacy/lang/ca/lemmatizer.py
index 2fd012912..0f15e6e65 100644
--- a/spacy/lang/ca/lemmatizer.py
+++ b/spacy/lang/ca/lemmatizer.py
@@ -72,10 +72,10 @@ class CatalanLemmatizer(Lemmatizer):
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
- if not forms and string in lookup_table.keys():
- forms.append(self.lookup_lemmatize(token)[0])
+
+ # use lookups, and fall back to the token itself
if not forms:
- forms.append(string)
+ forms.append(lookup_table.get(string, [string])[0])
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 1d204c46c..37c58c85f 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -280,7 +280,7 @@ _currency = (
_punct = (
r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ۔ ؛ ٪"
)
-_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
+_quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉 〈 〉 ⟦ ⟧'
_hyphens = "- – — -- --- —— ~"
# Various symbols like dingbats, but also emoji
diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py
index c6422cf96..a7cbe0bcf 100644
--- a/spacy/lang/fr/lemmatizer.py
+++ b/spacy/lang/fr/lemmatizer.py
@@ -53,11 +53,16 @@ class FrenchLemmatizer(Lemmatizer):
rules = rules_table.get(univ_pos, [])
string = string.lower()
forms = []
+ # first try lookup in table based on upos
if string in index:
forms.append(string)
self.cache[cache_key] = forms
return forms
+
+ # then add anything in the exceptions table
forms.extend(exceptions.get(string, []))
+
+ # if nothing found yet, use the rules
oov_forms = []
if not forms:
for old, new in rules:
@@ -69,12 +74,14 @@ class FrenchLemmatizer(Lemmatizer):
forms.append(form)
else:
oov_forms.append(form)
+
+ # if still nothing, add the oov forms from rules
if not forms:
forms.extend(oov_forms)
- if not forms and string in lookup_table.keys():
- forms.append(self.lookup_lemmatize(token)[0])
+
+ # use lookups, which fall back to the token itself
if not forms:
- forms.append(string)
+ forms.append(lookup_table.get(string, [string])[0])
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
diff --git a/spacy/lang/grc/__init__.py b/spacy/lang/grc/__init__.py
index e83f0c5a5..019b3802e 100644
--- a/spacy/lang/grc/__init__.py
+++ b/spacy/lang/grc/__init__.py
@@ -1,11 +1,15 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from ...language import Language, BaseDefaults
class AncientGreekDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+ prefixes = TOKENIZER_PREFIXES
+ suffixes = TOKENIZER_SUFFIXES
+ infixes = TOKENIZER_INFIXES
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
diff --git a/spacy/lang/grc/punctuation.py b/spacy/lang/grc/punctuation.py
new file mode 100644
index 000000000..8f3589e9a
--- /dev/null
+++ b/spacy/lang/grc/punctuation.py
@@ -0,0 +1,46 @@
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
+from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from ..char_classes import CONCAT_QUOTES
+
+_prefixes = (
+ [
+ "†",
+ "⸏",
+ ]
+ + LIST_PUNCT
+ + LIST_ELLIPSES
+ + LIST_QUOTES
+ + LIST_CURRENCY
+ + LIST_ICONS
+)
+
+_suffixes = (
+ LIST_PUNCT
+ + LIST_ELLIPSES
+ + LIST_QUOTES
+ + LIST_ICONS
+ + [
+ "†",
+ "⸎",
+ r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])[\-\.⸏]",
+ ]
+)
+
+_infixes = (
+ LIST_ELLIPSES
+ + LIST_ICONS
+ + [
+ r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+ r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+ al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+ ),
+ r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+ r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+ r"(?<=[\u1F00-\u1FFF\u0370-\u03FF])—",
+ ]
+)
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
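
A quick sketch of the intended effect on tokenization (requires a spaCy build with these rules; the expected output is what the new prefix/suffix entries should produce):

import spacy

nlp = spacy.blank("grc")
doc = nlp("†ἄνδρα μοι ἔννεπε⸏")
# Expected: the obelus and the paragraphos are split off as separate tokens.
print([t.text for t in doc])
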
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index c118c26ff..7d17628c4 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -28,7 +28,7 @@ class Russian(Language):
assigns=["token.lemma"],
default_config={
"model": None,
- "mode": "pymorphy2",
+ "mode": "pymorphy3",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 85180b1e4..c37a3a91a 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -19,11 +19,11 @@ class RussianLemmatizer(Lemmatizer):
model: Optional[Model],
name: str = "lemmatizer",
*,
- mode: str = "pymorphy2",
+ mode: str = "pymorphy3",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
- if mode == "pymorphy2":
+ if mode in {"pymorphy2", "pymorphy2_lookup"}:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
@@ -33,6 +33,16 @@ class RussianLemmatizer(Lemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer()
+ elif mode == "pymorphy3":
+ try:
+ from pymorphy3 import MorphAnalyzer
+ except ImportError:
+ raise ImportError(
+ "The Russian lemmatizer mode 'pymorphy3' requires the "
+ "pymorphy3 library. Install it with: pip install pymorphy3"
+ ) from None
+ if getattr(self, "_morph", None) is None:
+ self._morph = MorphAnalyzer()
super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
@@ -104,6 +114,9 @@ class RussianLemmatizer(Lemmatizer):
return [analyses[0].normal_form]
return [string]
+ def pymorphy3_lemmatize(self, token: Token) -> List[str]:
+ return self.pymorphy2_lemmatize(token)
+
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
gram_map = {
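
A minimal sketch of selecting the mode explicitly; the new default is `pymorphy3` (install with `pip install pymorphy3`), and the old behaviour remains available as `pymorphy2`:

import spacy

nlp = spacy.blank("ru")
# Note: the pymorphy modes read `token.pos_`, so a real pipeline needs a tagger or
# morphologizer running before the lemmatizer.
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})
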
diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py
index 9ddd676bf..0070e9fa1 100644
--- a/spacy/lang/sl/__init__.py
+++ b/spacy/lang/sl/__init__.py
@@ -1,9 +1,17 @@
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults
class SlovenianDefaults(BaseDefaults):
stop_words = STOP_WORDS
+ tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+ prefixes = TOKENIZER_PREFIXES
+ infixes = TOKENIZER_INFIXES
+ suffixes = TOKENIZER_SUFFIXES
+ lex_attr_getters = LEX_ATTRS
class Slovenian(Language):
diff --git a/spacy/lang/sl/lex_attrs.py b/spacy/lang/sl/lex_attrs.py
new file mode 100644
index 000000000..958152e37
--- /dev/null
+++ b/spacy/lang/sl/lex_attrs.py
@@ -0,0 +1,145 @@
+from ...attrs import LIKE_NUM
+from ...attrs import IS_CURRENCY
+import unicodedata
+
+
+_num_words = set(
+ """
+ nula ničla nič ena dva tri štiri pet šest sedem osem
+ devet deset enajst dvanajst trinajst štirinajst petnajst
+ šestnajst sedemnajst osemnajst devetnajst dvajset trideset štirideset
+ petdeset šestdest sedemdeset osemdeset devedeset sto tisoč
+ milijon bilijon trilijon kvadrilijon nešteto
+
+ en eden enega enemu ennem enim enih enima enimi ene eni eno
+ dveh dvema dvem dvoje trije treh trem tremi troje štirje štirih štirim štirimi
+ petih petim petimi šestih šestim šestimi sedmih sedmim sedmimi osmih osmim osmimi
+ devetih devetim devetimi desetih desetim desetimi enajstih enajstim enajstimi
+ dvanajstih dvanajstim dvanajstimi trinajstih trinajstim trinajstimi
+ šestnajstih šestnajstim šestnajstimi petnajstih petnajstim petnajstimi
+ sedemnajstih sedemnajstim sedemnajstimi osemnajstih osemnajstim osemnajstimi
+ devetnajstih devetnajstim devetnajstimi dvajsetih dvajsetim dvajsetimi
+ """.split()
+)
+
+_ordinal_words = set(
+ """
+ prvi drugi tretji četrti peti šesti sedmi osmi
+ deveti deseti enajsti dvanajsti trinajsti štirinajsti
+ petnajsti šestnajsti sedemnajsti osemnajsti devetnajsti
+ dvajseti trideseti štirideseti petdeseti šestdeseti sedemdeseti
+ osemdeseti devetdeseti stoti tisoči milijonti bilijonti
+ trilijonti kvadrilijonti nešteti
+
+ prva druga tretja četrta peta šesta sedma osma
+ deveta deseta enajsta dvanajsta trinajsta štirnajsta
+ petnajsta šestnajsta sedemnajsta osemnajsta devetnajsta
+ dvajseta trideseta štirideseta petdeseta šestdeseta sedemdeseta
+ osemdeseta devetdeseta stota tisoča milijonta bilijonta
+ trilijonta kvadrilijonta nešteta
+
+ prvo drugo tretje četrto peto šestro sedmo osmo
+ deveto deseto enajsto dvanajsto trinajsto štirnajsto
+ petnajsto šestnajsto sedemnajsto osemnajsto devetnajsto
+ dvajseto trideseto štirideseto petdeseto šestdeseto sedemdeseto
+ osemdeseto devetdeseto stoto tisočo milijonto bilijonto
+ trilijonto kvadrilijonto nešteto
+
+ prvega drugega tretjega četrtega petega šestega sedmega osmega
+ devega desetega enajstega dvanajstega trinajstega štirnajstega
+ petnajstega šestnajstega sedemnajstega osemnajstega devetnajstega
+ dvajsetega tridesetega štiridesetega petdesetega šestdesetega sedemdesetega
+ osemdesetega devetdesetega stotega tisočega milijontega bilijontega
+ trilijontega kvadrilijontega neštetega
+
+ prvemu drugemu tretjemu četrtemu petemu šestemu sedmemu osmemu devetemu desetemu
+ enajstemu dvanajstemu trinajstemu štirnajstemu petnajstemu šestnajstemu sedemnajstemu
+ osemnajstemu devetnajstemu dvajsetemu tridesetemu štiridesetemu petdesetemu šestdesetemu
+ sedemdesetemu osemdesetemu devetdesetemu stotemu tisočemu milijontemu bilijontemu
+ trilijontemu kvadrilijontemu neštetemu
+
+ prvem drugem tretjem četrtem petem šestem sedmem osmem devetem desetem
+ enajstem dvanajstem trinajstem štirnajstem petnajstem šestnajstem sedemnajstem
+ osemnajstem devetnajstem dvajsetem tridesetem štiridesetem petdesetem šestdesetem
+ sedemdesetem osemdesetem devetdesetem stotem tisočem milijontem bilijontem
+ trilijontem kvadrilijontem neštetem
+
+ prvim drugim tretjim četrtim petim šestim sedtim osmim devetim desetim
+ enajstim dvanajstim trinajstim štirnajstim petnajstim šestnajstim sedemnajstim
+ osemnajstim devetnajstim dvajsetim tridesetim štiridesetim petdesetim šestdesetim
+ sedemdesetim osemdesetim devetdesetim stotim tisočim milijontim bilijontim
+ trilijontim kvadrilijontim neštetim
+
+ prvih drugih tretjih četrthih petih šestih sedmih osmih deveth desetih
+ enajstih dvanajstih trinajstih štirnajstih petnajstih šestnajstih sedemnajstih
+ osemnajstih devetnajstih dvajsetih tridesetih štiridesetih petdesetih šestdesetih
+ sedemdesetih osemdesetih devetdesetih stotih tisočih milijontih bilijontih
+ trilijontih kvadrilijontih nešteth
+
+ prvima drugima tretjima četrtima petima šestima sedmima osmima devetima desetima
+ enajstima dvanajstima trinajstima štirnajstima petnajstima šestnajstima sedemnajstima
+ osemnajstima devetnajstima dvajsetima tridesetima štiridesetima petdesetima šestdesetima
+ sedemdesetima osemdesetima devetdesetima stotima tisočima milijontima bilijontima
+ trilijontima kvadrilijontima neštetima
+
+ prve druge četrte pete šeste sedme osme devete desete
+ enajste dvanajste trinajste štirnajste petnajste šestnajste sedemnajste
+ osemnajste devetnajste dvajsete tridesete štiridesete petdesete šestdesete
+ sedemdesete osemdesete devetdesete stote tisoče milijonte bilijonte
+ trilijonte kvadrilijonte neštete
+
+ prvimi drugimi tretjimi četrtimi petimi šestimi sedtimi osmimi devetimi desetimi
+ enajstimi dvanajstimi trinajstimi štirnajstimi petnajstimi šestnajstimi sedemnajstimi
+ osemnajstimi devetnajstimi dvajsetimi tridesetimi štiridesetimi petdesetimi šestdesetimi
+ sedemdesetimi osemdesetimi devetdesetimi stotimi tisočimi milijontimi bilijontimi
+ trilijontimi kvadrilijontimi neštetimi
+ """.split()
+)
+
+_currency_words = set(
+ """
+ evro evra evru evrom evrov evroma evrih evrom evre evri evr eur
+ cent centa centu cenom centov centoma centih centom cente centi
+ dolar dolarja dolarji dolarju dolarjem dolarjev dolarjema dolarjih dolarje usd
+ tolar tolarja tolarji tolarju tolarjem tolarjev tolarjema tolarjih tolarje tol
+ dinar dinarja dinarji dinarju dinarjem dinarjev dinarjema dinarjih dinarje din
+ funt funta funti funtu funtom funtov funtoma funtih funte gpb
+ forint forinta forinti forintu forintom forintov forintoma forintih forinte
+ zlot zlota zloti zlotu zlotom zlotov zlotoma zlotih zlote
+ rupij rupija rupiji rupiju rupijem rupijev rupijema rupijih rupije
+ jen jena jeni jenu jenom jenov jenoma jenih jene
+ kuna kuni kune kuno kun kunama kunah kunam kunami
+ marka marki marke markama markah markami
+ """.split()
+)
+
+
+def like_num(text):
+ if text.startswith(("+", "-", "±", "~")):
+ text = text[1:]
+ text = text.replace(",", "").replace(".", "")
+ if text.isdigit():
+ return True
+ if text.count("/") == 1:
+ num, denom = text.split("/")
+ if num.isdigit() and denom.isdigit():
+ return True
+ text_lower = text.lower()
+ if text_lower in _num_words:
+ return True
+ if text_lower in _ordinal_words:
+ return True
+ return False
+
+
+def is_currency(text):
+ text_lower = text.lower()
+ if text in _currency_words:
+ return True
+ for char in text:
+ if unicodedata.category(char) != "Sc":
+ return False
+ return True
+
+
+LEX_ATTRS = {LIKE_NUM: like_num, IS_CURRENCY: is_currency}
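
The getters can be exercised on their own, without building a pipeline; a short sketch:

from spacy.lang.sl.lex_attrs import is_currency, like_num

print(like_num("petnajst"))  # True -- Slovenian number word
print(like_num("3,14"))      # True -- digits after stripping separators
print(is_currency("€"))      # True -- currency symbol category "Sc"
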
diff --git a/spacy/lang/sl/punctuation.py b/spacy/lang/sl/punctuation.py
new file mode 100644
index 000000000..b6ca1830e
--- /dev/null
+++ b/spacy/lang/sl/punctuation.py
@@ -0,0 +1,84 @@
+from ..char_classes import (
+ LIST_ELLIPSES,
+ LIST_ICONS,
+ HYPHENS,
+ LIST_PUNCT,
+ LIST_QUOTES,
+ CURRENCY,
+ UNITS,
+ PUNCT,
+ LIST_CURRENCY,
+ CONCAT_QUOTES,
+)
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from ..char_classes import merge_chars
+from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
+
+
+INCLUDE_SPECIAL = ["\\+", "\\/", "\\•", "\\¯", "\\=", "\\×"] + HYPHENS.split("|")
+
+_prefixes = INCLUDE_SPECIAL + BASE_TOKENIZER_PREFIXES
+
+_suffixes = (
+ INCLUDE_SPECIAL
+ + LIST_PUNCT
+ + LIST_ELLIPSES
+ + LIST_QUOTES
+ + LIST_ICONS
+ + [
+ r"(?<=°[FfCcKk])\.",
+ r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
+ r"(?<=[0-9])(?:{u})".format(u=UNITS),
+ r"(?<=[{al}{e}{p}(?:{q})])\.".format(
+ al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
+ ),
+ r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
+ # split initials like J.K. Rowling
+ r"(?<=[A-Z]\.)(?:[A-Z].)",
+ ]
+)
+
+# a list of all suffixes following a hyphen that shouldn't be split off (e.g. BTC-jev)
+# source: Obeliks tokenizer - https://github.com/clarinsi/obeliks/blob/master/obeliks/res/TokRulesPart1.txt
+CONCAT_QUOTES = CONCAT_QUOTES.replace("'", "")
+HYPHENS_PERMITTED = (
+ "((a)|(evemu)|(evskega)|(i)|(jevega)|(jevska)|(jevskimi)|(jinemu)|(oma)|(ovim)|"
+ "(ovski)|(e)|(evi)|(evskem)|(ih)|(jevem)|(jevske)|(jevsko)|(jini)|(ov)|(ovima)|"
+ "(ovskih)|(em)|(evih)|(evskemu)|(ja)|(jevemu)|(jevskega)|(ji)|(jinih)|(ova)|"
+ "(ovimi)|(ovskim)|(ema)|(evim)|(evski)|(je)|(jevi)|(jevskem)|(jih)|(jinim)|"
+ "(ove)|(ovo)|(ovskima)|(ev)|(evima)|(evskih)|(jem)|(jevih)|(jevskemu)|(jin)|"
+ "(jinima)|(ovega)|(ovska)|(ovskimi)|(eva)|(evimi)|(evskim)|(jema)|(jevim)|"
+ "(jevski)|(jina)|(jinimi)|(ovem)|(ovske)|(ovsko)|(eve)|(evo)|(evskima)|(jev)|"
+ "(jevima)|(jevskih)|(jine)|(jino)|(ovemu)|(ovskega)|(u)|(evega)|(evska)|"
+ "(evskimi)|(jeva)|(jevimi)|(jevskim)|(jinega)|(ju)|(ovi)|(ovskem)|(evem)|"
+ "(evske)|(evsko)|(jeve)|(jevo)|(jevskima)|(jinem)|(om)|(ovih)|(ovskemu)|"
+ "(ovec)|(ovca)|(ovcu)|(ovcem)|(ovcev)|(ovcema)|(ovcih)|(ovci)|(ovce)|(ovcimi)|"
+ "(evec)|(evca)|(evcu)|(evcem)|(evcev)|(evcema)|(evcih)|(evci)|(evce)|(evcimi)|"
+ "(jevec)|(jevca)|(jevcu)|(jevcem)|(jevcev)|(jevcema)|(jevcih)|(jevci)|(jevce)|"
+ "(jevcimi)|(ovka)|(ovke)|(ovki)|(ovko)|(ovk)|(ovkama)|(ovkah)|(ovkam)|(ovkami)|"
+ "(evka)|(evke)|(evki)|(evko)|(evk)|(evkama)|(evkah)|(evkam)|(evkami)|(jevka)|"
+ "(jevke)|(jevki)|(jevko)|(jevk)|(jevkama)|(jevkah)|(jevkam)|(jevkami)|(timi)|"
+ "(im)|(ima)|(a)|(imi)|(e)|(o)|(ega)|(ti)|(em)|(tih)|(emu)|(tim)|(i)|(tima)|"
+ "(ih)|(ta)|(te)|(to)|(tega)|(tem)|(temu))"
+)
+
+_infixes = (
+ LIST_ELLIPSES
+ + LIST_ICONS
+ + [
+ r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+ r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+ al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+ ),
+ r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+ r"(?<=[{a}0-9])(?:{h})(?!{hp}$)(?=[{a}])".format(
+ a=ALPHA, h=HYPHENS, hp=HYPHENS_PERMITTED
+ ),
+ r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+ ]
+)
+
+
+TOKENIZER_PREFIXES = _prefixes
+TOKENIZER_SUFFIXES = _suffixes
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py
index c9004ed5d..8491efcb5 100644
--- a/spacy/lang/sl/stop_words.py
+++ b/spacy/lang/sl/stop_words.py
@@ -1,326 +1,84 @@
# Source: https://github.com/stopwords-iso/stopwords-sl
-# Removed various words that are not normally considered stop words, such as months.
STOP_WORDS = set(
"""
-a
-ali
-b
-bi
-bil
-bila
-bile
-bili
-bilo
-biti
-blizu
-bo
-bodo
-bolj
-bom
-bomo
-boste
-bova
-boš
-brez
-c
-cel
-cela
-celi
-celo
-d
-da
-daleč
-dan
-danes
-do
-dober
-dobra
-dobri
-dobro
-dokler
-dol
-dovolj
-e
-eden
-en
-ena
-ene
-eni
-enkrat
-eno
-etc.
+a ali
+
+b bi bil bila bile bili bilo biti blizu bo bodo bojo bolj bom bomo
+boste bova boš brez
+
+c cel cela celi celo
+
+č če često četrta četrtek četrti četrto čez čigav
+
+d da daleč dan danes datum deset deseta deseti deseto devet
+deveta deveti deveto do dober dobra dobri dobro dokler dol dolg
+dolga dolgi dovolj drug druga drugi drugo dva dve
+
+e eden en ena ene eni enkrat eno etc.
+
f
-g
-g.
-ga
-ga.
-gor
-gospa
-gospod
-h
-halo
-i
-idr.
-ii
-iii
-in
-iv
-ix
-iz
-j
-jaz
-je
-ji
-jih
-jim
-jo
-k
-kadarkoli
-kaj
-kajti
-kako
-kakor
-kamor
-kamorkoli
-kar
-karkoli
-katerikoli
-kdaj
-kdo
-kdorkoli
-ker
-ki
-kje
-kjer
-kjerkoli
-ko
-koderkoli
-koga
-komu
-kot
-l
-le
-lep
-lepa
-lepe
-lepi
-lepo
-m
-manj
-me
-med
-medtem
-mene
-mi
-midva
-midve
-mnogo
-moj
-moja
-moje
-mora
-morajo
-moram
-moramo
-morate
-moraš
-morem
-mu
-n
-na
-nad
-naj
-najina
-najino
-najmanj
-naju
-največ
-nam
-nas
-nato
-nazaj
-naš
-naša
-naše
-ne
-nedavno
-nek
-neka
-nekaj
-nekatere
-nekateri
-nekatero
-nekdo
-neke
-nekega
-neki
-nekje
-neko
-nekoga
-nekoč
-ni
-nikamor
-nikdar
-nikjer
-nikoli
-nič
-nje
-njega
-njegov
-njegova
-njegovo
-njej
-njemu
-njen
-njena
-njeno
-nji
-njih
-njihov
-njihova
-njihovo
-njiju
-njim
-njo
-njun
-njuna
-njuno
-no
-nocoj
-npr.
-o
-ob
-oba
-obe
-oboje
-od
-okoli
-on
-onadva
-one
-oni
-onidve
-oz.
-p
-pa
-po
-pod
-pogosto
-poleg
-ponavadi
-ponovno
-potem
-povsod
-prbl.
-precej
-pred
-prej
-preko
-pri
-pribl.
-približno
-proti
-r
-redko
-res
-s
-saj
-sam
-sama
-same
-sami
-samo
-se
-sebe
-sebi
-sedaj
-sem
-seveda
-si
-sicer
-skoraj
-skozi
-smo
-so
-spet
-sta
-ste
-sva
-t
-ta
-tak
-taka
-take
-taki
-tako
-takoj
-tam
-te
-tebe
-tebi
-tega
-ti
-tista
-tiste
-tisti
-tisto
-tj.
-tja
-to
-toda
-tu
-tudi
-tukaj
-tvoj
-tvoja
-tvoje
+
+g g. ga ga. gor gospa gospod
+
+h halo
+
+i idr. ii iii in iv ix iz
+
+j jaz je ji jih jim jo jutri
+
+k kadarkoli kaj kajti kako kakor kamor kamorkoli kar karkoli
+katerikoli kdaj kdo kdorkoli ker ki kje kjer kjerkoli
+ko koder koderkoli koga komu kot kratek kratka kratke kratki
+
+l lahka lahke lahki lahko le lep lepa lepe lepi lepo leto
+
+m majhen majhna majhni malce malo manj me med medtem mene
+mesec mi midva midve mnogo moj moja moje mora morajo moram
+moramo morate moraš morem mu
+
+n na nad naj najina najino najmanj naju največ nam narobe
+nas nato nazaj naš naša naše ne nedavno nedelja nek neka
+nekaj nekatere nekateri nekatero nekdo neke nekega neki
+nekje neko nekoga nekoč ni nikamor nikdar nikjer nikoli
+nič nje njega njegov njegova njegovo njej njemu njen
+njena njeno nji njih njihov njihova njihovo njiju njim
+njo njun njuna njuno no nocoj npr.
+
+o ob oba obe oboje od odprt odprta odprti okoli on
+onadva one oni onidve osem osma osmi osmo oz.
+
+p pa pet peta petek peti peto po pod pogosto poleg poln
+polna polni polno ponavadi ponedeljek ponovno potem
+povsod pozdravljen pozdravljeni prav prava prave pravi
+pravo prazen prazna prazno prbl. precej pred prej preko
+pri pribl. približno primer pripravljen pripravljena
+pripravljeni proti prva prvi prvo
+
+r ravno redko res reč
+
+s saj sam sama same sami samo se sebe sebi sedaj sedem
+sedma sedmi sedmo sem seveda si sicer skoraj skozi slab sm
+so sobota spet sreda srednja srednji sta ste stran stvar sva
+
+š šest šesta šesti šesto štiri
+
+t ta tak taka take taki tako takoj tam te tebe tebi tega
+težak težka težki težko ti tista tiste tisti tisto tj.
+tja to toda torek tretja tretje tretji tri tu tudi tukaj
+tvoj tvoja tvoje
+
u
-v
-vaju
-vam
-vas
-vaš
-vaša
-vaše
-ve
-vedno
-vendar
-ves
-več
-vi
-vidva
-vii
-viii
-vsa
-vsaj
-vsak
-vsaka
-vsakdo
-vsake
-vsaki
-vsakomur
-vse
-vsega
-vsi
-vso
-včasih
-x
-z
-za
-zadaj
-zadnji
-zakaj
-zdaj
-zelo
-zunaj
-č
-če
-često
-čez
-čigav
-š
-ž
-že
+
+v vaju vam vas vaš vaša vaše ve vedno velik velika veliki
+veliko vendar ves več vi vidva vii viii visok visoka visoke
+visoki vsa vsaj vsak vsaka vsakdo vsake vsaki vsakomur vse
+vsega vsi vso včasih včeraj
+
+x
+
+z za zadaj zadnji zakaj zaprta zaprti zaprto zdaj zelo zunaj
+
+ž že
""".split()
)
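
The grouped layout above is purely cosmetic: `.split()` flattens the whole string back into individual stop words. A minimal sketch of how the list surfaces at runtime, assuming spaCy is installed with the Slovenian language class (the sample sentence is illustrative):

    import spacy

    # Stop words from the language defaults drive token.is_stop.
    nlp = spacy.blank("sl")
    doc = nlp("To je kratek primer.")
    for token in doc:
        print(token.text, token.is_stop)
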
diff --git a/spacy/lang/sl/tokenizer_exceptions.py b/spacy/lang/sl/tokenizer_exceptions.py
new file mode 100644
index 000000000..3d4109228
--- /dev/null
+++ b/spacy/lang/sl/tokenizer_exceptions.py
@@ -0,0 +1,272 @@
+from typing import Dict, List
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc
+
+_exc: Dict[str, List[Dict]] = {}
+
+_other_exc = {
+ "t.i.": [{ORTH: "t.", NORM: "tako"}, {ORTH: "i.", NORM: "imenovano"}],
+ "t.j.": [{ORTH: "t.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
+ "T.j.": [{ORTH: "T.", NORM: "to"}, {ORTH: "j.", NORM: "je"}],
+ "d.o.o.": [
+ {ORTH: "d.", NORM: "družba"},
+ {ORTH: "o.", NORM: "omejeno"},
+ {ORTH: "o.", NORM: "odgovornostjo"},
+ ],
+ "D.O.O.": [
+ {ORTH: "D.", NORM: "družba"},
+ {ORTH: "O.", NORM: "omejeno"},
+ {ORTH: "O.", NORM: "odgovornostjo"},
+ ],
+ "d.n.o.": [
+ {ORTH: "d.", NORM: "družba"},
+ {ORTH: "n.", NORM: "neomejeno"},
+ {ORTH: "o.", NORM: "odgovornostjo"},
+ ],
+ "D.N.O.": [
+ {ORTH: "D.", NORM: "družba"},
+ {ORTH: "N.", NORM: "neomejeno"},
+ {ORTH: "O.", NORM: "odgovornostjo"},
+ ],
+ "d.d.": [{ORTH: "d.", NORM: "delniška"}, {ORTH: "d.", NORM: "družba"}],
+ "D.D.": [{ORTH: "D.", NORM: "delniška"}, {ORTH: "D.", NORM: "družba"}],
+ "s.p.": [{ORTH: "s.", NORM: "samostojni"}, {ORTH: "p.", NORM: "podjetnik"}],
+ "S.P.": [{ORTH: "S.", NORM: "samostojni"}, {ORTH: "P.", NORM: "podjetnik"}],
+ "l.r.": [{ORTH: "l.", NORM: "lastno"}, {ORTH: "r.", NORM: "ročno"}],
+ "le-te": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "te"}],
+ "Le-te": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "te"}],
+ "le-ti": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ti"}],
+ "Le-ti": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ti"}],
+ "le-to": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "to"}],
+ "Le-to": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "to"}],
+ "le-ta": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "ta"}],
+ "Le-ta": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "ta"}],
+ "le-tega": [{ORTH: "le"}, {ORTH: "-"}, {ORTH: "tega"}],
+ "Le-tega": [{ORTH: "Le"}, {ORTH: "-"}, {ORTH: "tega"}],
+}
+
+_exc.update(_other_exc)
+
+
+for exc_data in [
+ {ORTH: "adm.", NORM: "administracija"},
+ {ORTH: "aer.", NORM: "aeronavtika"},
+ {ORTH: "agr.", NORM: "agronomija"},
+ {ORTH: "amer.", NORM: "ameriško"},
+ {ORTH: "anat.", NORM: "anatomija"},
+ {ORTH: "angl.", NORM: "angleški"},
+ {ORTH: "ant.", NORM: "antonim"},
+ {ORTH: "antr.", NORM: "antropologija"},
+ {ORTH: "apr.", NORM: "april"},
+ {ORTH: "arab.", NORM: "arabsko"},
+ {ORTH: "arheol.", NORM: "arheologija"},
+ {ORTH: "arhit.", NORM: "arhitektura"},
+ {ORTH: "avg.", NORM: "avgust"},
+ {ORTH: "avstr.", NORM: "avstrijsko"},
+ {ORTH: "avt.", NORM: "avtomobilizem"},
+ {ORTH: "bibl.", NORM: "biblijsko"},
+ {ORTH: "biokem.", NORM: "biokemija"},
+ {ORTH: "biol.", NORM: "biologija"},
+ {ORTH: "bolg.", NORM: "bolgarski"},
+ {ORTH: "bot.", NORM: "botanika"},
+ {ORTH: "cit.", NORM: "citat"},
+ {ORTH: "daj.", NORM: "dajalnik"},
+ {ORTH: "del.", NORM: "deležnik"},
+ {ORTH: "ed.", NORM: "ednina"},
+ {ORTH: "etn.", NORM: "etnografija"},
+ {ORTH: "farm.", NORM: "farmacija"},
+ {ORTH: "filat.", NORM: "filatelija"},
+ {ORTH: "filoz.", NORM: "filozofija"},
+ {ORTH: "fin.", NORM: "finančništvo"},
+ {ORTH: "fiz.", NORM: "fizika"},
+ {ORTH: "fot.", NORM: "fotografija"},
+ {ORTH: "fr.", NORM: "francoski"},
+ {ORTH: "friz.", NORM: "frizerstvo"},
+ {ORTH: "gastr.", NORM: "gastronomija"},
+ {ORTH: "geogr.", NORM: "geografija"},
+ {ORTH: "geol.", NORM: "geologija"},
+ {ORTH: "geom.", NORM: "geometrija"},
+ {ORTH: "germ.", NORM: "germanski"},
+ {ORTH: "gl.", NORM: "glej"},
+ {ORTH: "glag.", NORM: "glagolski"},
+ {ORTH: "glasb.", NORM: "glasba"},
+ {ORTH: "gled.", NORM: "gledališče"},
+ {ORTH: "gost.", NORM: "gostinstvo"},
+ {ORTH: "gozd.", NORM: "gozdarstvo"},
+ {ORTH: "gr.", NORM: "grški"},
+ {ORTH: "grad.", NORM: "gradbeništvo"},
+ {ORTH: "hebr.", NORM: "hebrejsko"},
+ {ORTH: "hrv.", NORM: "hrvaško"},
+ {ORTH: "ide.", NORM: "indoevropsko"},
+ {ORTH: "igr.", NORM: "igre"},
+ {ORTH: "im.", NORM: "imenovalnik"},
+ {ORTH: "iron.", NORM: "ironično"},
+ {ORTH: "it.", NORM: "italijanski"},
+ {ORTH: "itd.", NORM: "in tako dalje"},
+ {ORTH: "itn.", NORM: "in tako naprej"},
+ {ORTH: "ipd.", NORM: "in podobno"},
+ {ORTH: "jap.", NORM: "japonsko"},
+ {ORTH: "jul.", NORM: "julij"},
+ {ORTH: "jun.", NORM: "junij"},
+ {ORTH: "kit.", NORM: "kitajsko"},
+ {ORTH: "knj.", NORM: "knjižno"},
+ {ORTH: "knjiž.", NORM: "knjižno"},
+ {ORTH: "kor.", NORM: "koreografija"},
+ {ORTH: "lat.", NORM: "latinski"},
+ {ORTH: "les.", NORM: "lesna stroka"},
+ {ORTH: "lingv.", NORM: "lingvistika"},
+ {ORTH: "lit.", NORM: "literarni"},
+ {ORTH: "ljubk.", NORM: "ljubkovalno"},
+ {ORTH: "lov.", NORM: "lovstvo"},
+ {ORTH: "m.", NORM: "moški"},
+ {ORTH: "mak.", NORM: "makedonski"},
+ {ORTH: "mar.", NORM: "marec"},
+ {ORTH: "mat.", NORM: "matematika"},
+ {ORTH: "med.", NORM: "medicina"},
+ {ORTH: "meh.", NORM: "mehiško"},
+ {ORTH: "mest.", NORM: "mestnik"},
+ {ORTH: "mdr.", NORM: "med drugim"},
+ {ORTH: "min.", NORM: "mineralogija"},
+ {ORTH: "mitol.", NORM: "mitologija"},
+ {ORTH: "mn.", NORM: "množina"},
+ {ORTH: "mont.", NORM: "montanistika"},
+ {ORTH: "muz.", NORM: "muzikologija"},
+ {ORTH: "nam.", NORM: "namenilnik"},
+ {ORTH: "nar.", NORM: "narečno"},
+ {ORTH: "nav.", NORM: "navadno"},
+ {ORTH: "nedol.", NORM: "nedoločnik"},
+ {ORTH: "nedov.", NORM: "nedovršni"},
+ {ORTH: "neprav.", NORM: "nepravilno"},
+ {ORTH: "nepreh.", NORM: "neprehodno"},
+ {ORTH: "neskl.", NORM: "nesklonljiv(o)"},
+ {ORTH: "nestrok.", NORM: "nestrokovno"},
+ {ORTH: "num.", NORM: "numizmatika"},
+ {ORTH: "npr.", NORM: "na primer"},
+ {ORTH: "obrt.", NORM: "obrtništvo"},
+ {ORTH: "okt.", NORM: "oktober"},
+ {ORTH: "or.", NORM: "orodnik"},
+ {ORTH: "os.", NORM: "oseba"},
+ {ORTH: "otr.", NORM: "otroško"},
+ {ORTH: "oz.", NORM: "oziroma"},
+ {ORTH: "pal.", NORM: "paleontologija"},
+ {ORTH: "papir.", NORM: "papirništvo"},
+ {ORTH: "ped.", NORM: "pedagogika"},
+ {ORTH: "pisar.", NORM: "pisarniško"},
+ {ORTH: "pog.", NORM: "pogovorno"},
+ {ORTH: "polit.", NORM: "politika"},
+ {ORTH: "polj.", NORM: "poljsko"},
+ {ORTH: "poljud.", NORM: "poljudno"},
+ {ORTH: "preg.", NORM: "pregovor"},
+ {ORTH: "preh.", NORM: "prehodno"},
+ {ORTH: "pren.", NORM: "preneseno"},
+ {ORTH: "prid.", NORM: "pridevnik"},
+ {ORTH: "prim.", NORM: "primerjaj"},
+ {ORTH: "prisl.", NORM: "prislov"},
+ {ORTH: "psih.", NORM: "psihologija"},
+ {ORTH: "psiht.", NORM: "psihiatrija"},
+ {ORTH: "rad.", NORM: "radiotehnika"},
+ {ORTH: "rač.", NORM: "računalništvo"},
+ {ORTH: "rib.", NORM: "ribištvo"},
+ {ORTH: "rod.", NORM: "rodilnik"},
+ {ORTH: "rus.", NORM: "rusko"},
+ {ORTH: "s.", NORM: "srednji"},
+ {ORTH: "sam.", NORM: "samostalniški"},
+ {ORTH: "sed.", NORM: "sedanjik"},
+ {ORTH: "sep.", NORM: "september"},
+ {ORTH: "slabš.", NORM: "slabšalno"},
+ {ORTH: "slovan.", NORM: "slovansko"},
+ {ORTH: "slovaš.", NORM: "slovaško"},
+ {ORTH: "srb.", NORM: "srbsko"},
+ {ORTH: "star.", NORM: "starinsko"},
+ {ORTH: "stil.", NORM: "stilno"},
+ {ORTH: "sv.", NORM: "svet(i)"},
+ {ORTH: "teh.", NORM: "tehnika"},
+ {ORTH: "tisk.", NORM: "tiskarstvo"},
+ {ORTH: "tj.", NORM: "to je"},
+ {ORTH: "tož.", NORM: "tožilnik"},
+ {ORTH: "trg.", NORM: "trgovina"},
+ {ORTH: "ukr.", NORM: "ukrajinski"},
+ {ORTH: "um.", NORM: "umetnost"},
+ {ORTH: "vel.", NORM: "velelnik"},
+ {ORTH: "vet.", NORM: "veterina"},
+ {ORTH: "vez.", NORM: "veznik"},
+ {ORTH: "vn.", NORM: "visokonemško"},
+ {ORTH: "voj.", NORM: "vojska"},
+ {ORTH: "vrtn.", NORM: "vrtnarstvo"},
+ {ORTH: "vulg.", NORM: "vulgarno"},
+ {ORTH: "vznes.", NORM: "vzneseno"},
+ {ORTH: "zal.", NORM: "založništvo"},
+ {ORTH: "zastar.", NORM: "zastarelo"},
+ {ORTH: "zgod.", NORM: "zgodovina"},
+ {ORTH: "zool.", NORM: "zoologija"},
+ {ORTH: "čeb.", NORM: "čebelarstvo"},
+ {ORTH: "češ.", NORM: "češki"},
+ {ORTH: "člov.", NORM: "človeškost"},
+ {ORTH: "šah.", NORM: "šahovski"},
+ {ORTH: "šalj.", NORM: "šaljivo"},
+ {ORTH: "šp.", NORM: "španski"},
+ {ORTH: "špan.", NORM: "špansko"},
+ {ORTH: "šport.", NORM: "športni"},
+ {ORTH: "štev.", NORM: "števnik"},
+ {ORTH: "šved.", NORM: "švedsko"},
+ {ORTH: "švic.", NORM: "švicarsko"},
+ {ORTH: "ž.", NORM: "ženski"},
+ {ORTH: "žarg.", NORM: "žargonsko"},
+ {ORTH: "žel.", NORM: "železnica"},
+ {ORTH: "živ.", NORM: "živost"},
+]:
+ _exc[exc_data[ORTH]] = [exc_data]
+
+
+abbrv = """
+Co. Ch. DIPL. DR. Dr. Ev. Inc. Jr. Kr. Mag. M. MR. Mr. Mt. Murr. Npr. OZ.
+Opr. Osn. Prim. Roj. ST. Sim. Sp. Sred. St. Sv. Škofl. Tel. UR. Zb.
+a. aa. ab. abc. abit. abl. abs. abt. acc. accel. add. adj. adv. aet. afr. akad. al. alban. all. alleg.
+alp. alt. alter. alžir. am. an. andr. ang. anh. anon. ans. antrop. apoc. app. approx. apt. ar. arc. arch.
+arh. arr. as. asist. assist. assoc. asst. astr. attn. aug. avstral. az. b. bab. bal. bbl. bd. belg. bioinf.
+biomed. bk. bl. bn. borg. bp. br. braz. brit. bros. broš. bt. bu. c. ca. cal. can. cand. cantab. cap. capt.
+cat. cath. cc. cca. cd. cdr. cdre. cent. cerkv. cert. cf. cfr. ch. chap. chem. chr. chs. cic. circ. civ. cl.
+cm. cmd. cnr. co. cod. col. coll. colo. com. comp. con. conc. cond. conn. cons. cont. coop. corr. cost. cp.
+cpl. cr. crd. cres. cresc. ct. cu. d. dan. dat. davč. ddr. dec. ded. def. dem. dent. dept. dia. dip. dipl.
+dir. disp. diss. div. do. doc. dok. dol. doo. dop. dott. dr. dram. druž. družb. drž. dt. duh. dur. dvr. dwt. e.
+ea. ecc. eccl. eccles. econ. edn. egipt. egr. ekon. eksp. el. em. enc. eng. eo. ep. err. esp. esq. est.
+et. etc. etnogr. etnol. ev. evfem. evr. ex. exc. excl. exp. expl. ext. exx. f. fa. facs. fak. faks. fas.
+fasc. fco. fcp. feb. febr. fec. fed. fem. ff. fff. fid. fig. fil. film. fiziol. fiziot. flam. fm. fo. fol. folk.
+frag. fran. franc. fsc. g. ga. gal. gdč. ge. gen. geod. geog. geotehnol. gg. gimn. glas. glav. gnr. go. gor.
+gosp. gp. graf. gram. gren. grš. gs. h. hab. hf. hist. ho. hort. i. ia. ib. ibid. id. idr. idridr. ill. imen.
+imp. impf. impr. in. inc. incl. ind. indus. inf. inform. ing. init. ins. int. inv. inšp. inštr. inž. is. islam.
+ist. ital. iur. iz. izbr. izd. izg. izgr. izr. izv. j. jak. jam. jan. jav. je. jez. jr. jsl. jud. jug.
+jugoslovan. jur. juž. jv. jz. k. kal. kan. kand. kat. kdo. kem. kip. kmet. kol. kom. komp. konf. kont. kost. kov.
+kp. kpfw. kr. kraj. krat. kub. kult. kv. kval. l. la. lab. lb. ld. let. lib. lik. litt. lj. ljud. ll. loc. log.
+loč. lt. ma. madž. mag. manag. manjš. masc. mass. mater. max. maxmax. mb. md. mech. medic. medij. medn.
+mehč. mem. menedž. mes. mess. metal. meteor. meteorol. mex. mi. mikr. mil. minn. mio. misc. miss. mit. mk.
+mkt. ml. mlad. mlle. mlr. mm. mme. množ. mo. moj. moš. možn. mr. mrd. mrs. ms. msc. msgr. mt. murr. mus. mut.
+n. na. nad. nadalj. nadom. nagl. nakl. namer. nan. naniz. nasl. nat. navt. nač. ned. nem. nik. nizoz. nm. nn.
+no. nom. norv. notr. nov. novogr. ns. o. ob. obd. obj. oblač. obl. oblik. obr. obraz. obs. obst. obt. obč. oc.
+oct. od. odd. odg. odn. odst. odv. oec. off. ok. okla. okr. ont. oo. op. opis. opp. opr. orch. ord. ore. oreg.
+org. orient. orig. ork. ort. oseb. osn. ot. ozir. ošk. p. pag. par. para. parc. parl. part. past. pat. pdk.
+pen. perf. pert. perz. pesn. pet. pev. pf. pfc. ph. pharm. phil. pis. pl. po. pod. podr. podaljš. pogl. pogoj. pojm.
+pok. pokr. pol. poljed. poljub. polu. pom. pomen. pon. ponov. pop. por. port. pos. posl. posn. pov. pp. ppl. pr.
+praet. prav. pravopis. pravosl. preb. pred. predl. predm. predp. preds. pref. pregib. prel. prem. premen. prep.
+pres. pret. prev. pribl. prih. pril. primerj. primor. prip. pripor. prir. prist. priv. proc. prof. prog. proiz.
+prom. pron. prop. prot. protest. prov. ps. pss. pt. publ. pz. q. qld. qu. quad. que. r. racc. rastl. razgl.
+razl. razv. rd. red. ref. reg. rel. relig. rep. repr. rer. resp. rest. ret. rev. revol. rež. rim. rist. rkp. rm.
+roj. rom. romun. rp. rr. rt. rud. ruš. ry. sal. samogl. san. sc. scen. sci. scr. sdv. seg. sek. sen. sept. ser.
+sev. sg. sgt. sh. sig. sigg. sign. sim. sin. sing. sinh. skand. skl. sklad. sklanj. sklep. skr. sl. slik. slov.
+slovak. slovn. sn. so. sob. soc. sociol. sod. sopomen. sopr. sor. sov. sovj. sp. spec. spl. spr. spreg. sq. sr.
+sre. sred. sredoz. srh. ss. ssp. st. sta. stan. stanstar. stcsl. ste. stim. stol. stom. str. stroj. strok. stsl.
+stud. sup. supl. suppl. svet. sz. t. tab. tech. ted. tehn. tehnol. tek. teks. tekst. tel. temp. ten. teol. ter.
+term. test. th. theol. tim. tip. tisočl. tit. tl. tol. tolmač. tom. tor. tov. tr. trad. traj. trans. tren.
+trib. tril. trop. trp. trž. ts. tt. tu. tur. turiz. tvor. tvorb. tč. u. ul. umet. un. univ. up. upr. ur. urad.
+us. ust. utr. v. va. val. var. varn. ven. ver. verb. vest. vezal. vic. vis. viv. viz. viš. vod. vok. vol. vpr.
+vrst. vrstil. vs. vv. vzd. vzg. vzh. vzor. w. wed. wg. wk. x. y. z. zah. zaim. zak. zap. zasl. zavar. zač. zb.
+združ. zg. zn. znan. znanstv. zoot. zun. zv. zvd. á. é. ć. č. čas. čet. čl. člen. čustv. đ. ľ. ł. ş. ŠT. š. šir.
+škofl. škot. šol. št. števil. štud. ů. ű. žen. žival.
+""".split()
+
+for orth in abbrv:
+ _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
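
A short sketch of what these exceptions do, assuming the Slovenian defaults wire TOKENIZER_EXCEPTIONS into the tokenizer (the sample sentence is illustrative):

    import spacy

    nlp = spacy.blank("sl")
    # "d.o.o." is split into "d." / "o." / "o." with the expanded NORM values,
    # while single-token abbreviations (e.g. "npr.") are kept as one token.
    doc = nlp("Podjetje Primer d.o.o. prodaja npr. mize.")
    print([(t.text, t.norm_) for t in doc])
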
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index 737243b66..bfea9ff69 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -29,7 +29,7 @@ class Ukrainian(Language):
assigns=["token.lemma"],
default_config={
"model": None,
- "mode": "pymorphy2",
+ "mode": "pymorphy3",
"overwrite": False,
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
},
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index a8bc56057..8337e7328 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -14,11 +14,11 @@ class UkrainianLemmatizer(RussianLemmatizer):
model: Optional[Model],
name: str = "lemmatizer",
*,
- mode: str = "pymorphy2",
+ mode: str = "pymorphy3",
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
- if mode == "pymorphy2":
+ if mode in {"pymorphy2", "pymorphy2_lookup"}:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
@@ -29,6 +29,17 @@ class UkrainianLemmatizer(RussianLemmatizer):
) from None
if getattr(self, "_morph", None) is None:
self._morph = MorphAnalyzer(lang="uk")
+ elif mode == "pymorphy3":
+ try:
+ from pymorphy3 import MorphAnalyzer
+ except ImportError:
+ raise ImportError(
+ "The Ukrainian lemmatizer mode 'pymorphy3' requires the "
+ "pymorphy3 library and dictionaries. Install them with: "
+ "pip install pymorphy3 pymorphy3-dicts-uk"
+ ) from None
+ if getattr(self, "_morph", None) is None:
+ self._morph = MorphAnalyzer(lang="uk")
super().__init__(
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
)
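
A minimal usage sketch for the new default mode, assuming pymorphy3 and its Ukrainian dictionaries are installed (pip install pymorphy3 pymorphy3-dicts-uk); the sample text is illustrative:

    import spacy

    # The Ukrainian lemmatizer now defaults to "pymorphy3"; the mode is written
    # out explicitly here for clarity. "pymorphy2" remains available as a fallback.
    nlp = spacy.blank("uk")
    nlp.add_pipe("lemmatizer", config={"mode": "pymorphy3"})
    doc = nlp("Привіт, світе!")
    print([token.lemma_ for token in doc])
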
diff --git a/spacy/language.py b/spacy/language.py
index 34a06e576..d391f15ab 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,4 +1,4 @@
-from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Collection
+from typing import Iterator, Optional, Any, Dict, Callable, Iterable
from typing import Union, Tuple, List, Set, Pattern, Sequence
from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
@@ -10,6 +10,7 @@ from contextlib import contextmanager
from copy import deepcopy
from pathlib import Path
import warnings
+
from thinc.api import get_current_ops, Config, CupyOps, Optimizer
import srsly
import multiprocessing as mp
@@ -24,7 +25,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .training import Example, validate_examples
from .training.initialize import init_vocab, init_tok2vec
from .scorer import Scorer
-from .util import registry, SimpleFrozenList, _pipe, raise_error
+from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .util import warn_if_jupyter_cupy
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
@@ -1698,9 +1699,9 @@ class Language:
config: Union[Dict[str, Any], Config] = {},
*,
vocab: Union[Vocab, bool] = True,
- disable: Union[str, Iterable[str]] = SimpleFrozenList(),
- enable: Union[str, Iterable[str]] = SimpleFrozenList(),
- exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+ disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
meta: Dict[str, Any] = SimpleFrozenDict(),
auto_fill: bool = True,
validate: bool = True,
@@ -1727,12 +1728,6 @@ class Language:
DOCS: https://spacy.io/api/language#from_config
"""
- if isinstance(disable, str):
- disable = [disable]
- if isinstance(enable, str):
- enable = [enable]
- if isinstance(exclude, str):
- exclude = [exclude]
if auto_fill:
config = Config(
cls.default_config, section_order=CONFIG_SECTION_ORDER
@@ -1877,9 +1872,38 @@ class Language:
nlp.vocab.from_bytes(vocab_b)
# Resolve disabled/enabled settings.
+ if isinstance(disable, str):
+ disable = [disable]
+ if isinstance(enable, str):
+ enable = [enable]
+ if isinstance(exclude, str):
+ exclude = [exclude]
+
+ def fetch_pipes_status(value: Iterable[str], key: str) -> Iterable[str]:
+ """Fetch value for `enable` or `disable` w.r.t. the specified config and passed arguments passed to
+ .load(). If both arguments and config specified values for this field, the passed arguments take precedence
+ and a warning is printed.
+ value (Iterable[str]): Passed value for `enable` or `disable`.
+ key (str): Key for field in config (either "enabled" or "disabled").
+ RETURN (Iterable[str]):
+ """
+ # We assume that no argument was passed if the value is the specified default value.
+ if id(value) == id(_DEFAULT_EMPTY_PIPES):
+ return config["nlp"].get(key, [])
+ else:
+ if len(config["nlp"].get(key, [])):
+ warnings.warn(
+ Warnings.W123.format(
+ arg=key[:-1],
+ arg_value=value,
+ config_value=config["nlp"][key],
+ )
+ )
+ return value
+
disabled_pipes = cls._resolve_component_status(
- [*config["nlp"]["disabled"], *disable],
- [*config["nlp"].get("enabled", []), *enable],
+ fetch_pipes_status(disable, "disabled"),
+ fetch_pipes_status(enable, "enabled"),
config["nlp"]["pipeline"],
)
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
@@ -2064,14 +2088,7 @@ class Language:
pipe_name for pipe_name in pipe_names if pipe_name not in enable
]
if disable and disable != to_disable:
- raise ValueError(
- Errors.E1042.format(
- arg1="enable",
- arg2="disable",
- arg1_values=enable,
- arg2_values=disable,
- )
- )
+ raise ValueError(Errors.E1042.format(enable=enable, disable=disable))
return tuple(to_disable)
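
A brief sketch of the precedence that `fetch_pipes_status` establishes; the package name "en_core_web_sm" and the component names are illustrative, and the pipeline must be installed:

    import spacy

    # No argument passed: the config's [nlp] disabled list is used as-is.
    nlp = spacy.load("en_core_web_sm")

    # Explicit argument: it overrides the config's list and, if the config also
    # specifies disabled components, warning W123 is emitted.
    nlp = spacy.load("en_core_web_sm", disable=["ner"])
    print(nlp.disabled)
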
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index d847342a3..4d18d216a 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -1,11 +1,12 @@
from pathlib import Path
from typing import Optional, Callable, Iterable, List, Tuple
from thinc.types import Floats2d
-from thinc.api import chain, clone, list2ragged, reduce_mean, residual
-from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged
+from thinc.api import chain, list2ragged, reduce_mean, residual
+from thinc.api import Model, Maxout, Linear, tuplify, Ragged
from ...util import registry
-from ...kb import KnowledgeBase, Candidate, get_candidates
+from ...kb import KnowledgeBase, InMemoryLookupKB
+from ...kb import Candidate, get_candidates, get_candidates_batch
from ...vocab import Vocab
from ...tokens import Span, Doc
from ..extract_spans import extract_spans
@@ -78,9 +79,11 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
@registry.misc("spacy.KBFromFile.v1")
-def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
- def kb_from_file(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=1)
+def load_kb(
+ kb_path: Path,
+) -> Callable[[Vocab], KnowledgeBase]:
+ def kb_from_file(vocab: Vocab):
+ kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.from_disk(kb_path)
return kb
@@ -88,9 +91,11 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.EmptyKB.v1")
-def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
- def empty_kb_factory(vocab):
- return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
+def empty_kb(
+ entity_vector_length: int,
+) -> Callable[[Vocab], KnowledgeBase]:
+ def empty_kb_factory(vocab: Vocab):
+ return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
return empty_kb_factory
@@ -98,3 +103,10 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
@registry.misc("spacy.CandidateGenerator.v1")
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
return get_candidates
+
+
+@registry.misc("spacy.CandidateBatchGenerator.v1")
+def create_candidates_batch() -> Callable[
+ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+]:
+ return get_candidates_batch
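
A sketch of plugging a custom batched candidate generator into the new registry slot; the registry name "my_org.CustomBatchCandidates.v1" and the function body are illustrative assumptions, not part of the change above:

    from typing import Callable, Iterable

    from spacy import registry
    from spacy.kb import Candidate, KnowledgeBase, get_candidates_batch
    from spacy.tokens import Span


    @registry.misc("my_org.CustomBatchCandidates.v1")
    def create_custom_batch_candidates() -> Callable[
        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
    ]:
        def get_custom_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]):
            # Delegate to the built-in batched lookup; a real implementation could
            # resolve the whole batch against an external service in one call.
            return get_candidates_batch(kb, mentions)

        return get_custom_candidates_batch

The entity_linker config's "get_candidates_batch" setting can then reference this @misc name in place of "spacy.CandidateBatchGenerator.v1".
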
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index b7d615f6d..12f9b73a3 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -1,7 +1,6 @@
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
-from typing import Sequence, Tuple, Union
+from typing import Tuple
from collections import Counter
-from copy import deepcopy
from itertools import islice
import numpy as np
@@ -149,9 +148,7 @@ class EditTreeLemmatizer(TrainablePipe):
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
n_labels = len(self.cfg["labels"])
- guesses: List[Ints2d] = [
- self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
- ]
+ guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
assert len(guesses) == n_docs
return guesses
scores = self.model.predict(docs)
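
The replacement above is behaviourally equivalent to the old allocation; a quick sketch with thinc's NumpyOps (assuming thinc is installed):

    from thinc.api import NumpyOps

    ops = NumpyOps()
    # alloc2i(0, n) is shorthand for an empty 2D integer array, matching the
    # previous ops.alloc((0, n), dtype="i") call.
    guesses = ops.alloc2i(0, 5)
    print(guesses.shape)  # (0, 5)
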
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 73a90b268..62845287b 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -53,9 +53,11 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
"incl_context": True,
"entity_vector_length": 64,
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
+ "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
"overwrite": True,
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
"use_gold_ents": True,
+ "candidates_batch_size": 1,
"threshold": None,
},
default_score_weights={
@@ -75,9 +77,13 @@ def make_entity_linker(
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
+ get_candidates_batch: Callable[
+ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+ ],
overwrite: bool,
scorer: Optional[Callable],
use_gold_ents: bool,
+ candidates_batch_size: int,
threshold: Optional[float] = None,
):
"""Construct an EntityLinker component.
@@ -90,17 +96,21 @@ def make_entity_linker(
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
- get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
+ get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
+ get_candidates_batch (
+        Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]
+ ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
+ candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
prediction is discarded. If None, predictions are not filtered by any threshold.
"""
if not model.attrs.get("include_span_maker", False):
- # The only difference in arguments here is that use_gold_ents is not available
+ # The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
model,
@@ -124,9 +134,11 @@ def make_entity_linker(
incl_context=incl_context,
entity_vector_length=entity_vector_length,
get_candidates=get_candidates,
+ get_candidates_batch=get_candidates_batch,
overwrite=overwrite,
scorer=scorer,
use_gold_ents=use_gold_ents,
+ candidates_batch_size=candidates_batch_size,
threshold=threshold,
)
@@ -160,9 +172,13 @@ class EntityLinker(TrainablePipe):
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
+ get_candidates_batch: Callable[
+ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+ ],
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
use_gold_ents: bool,
+ candidates_batch_size: int,
threshold: Optional[float] = None,
) -> None:
"""Initialize an entity linker.
@@ -178,10 +194,14 @@ class EntityLinker(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
- scorer (Optional[Callable]): The scoring method. Defaults to
- Scorer.score_links.
+ get_candidates_batch (
+            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]
+ ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
+ scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
component must provide entity annotations.
+ candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the
threshold, prediction is discarded. If None, predictions are not filtered by any threshold.
DOCS: https://spacy.io/api/entitylinker#init
@@ -204,22 +224,27 @@ class EntityLinker(TrainablePipe):
self.incl_prior = incl_prior
self.incl_context = incl_context
self.get_candidates = get_candidates
+ self.get_candidates_batch = get_candidates_batch
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account
- # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
+ # create an empty KB by default
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
self.use_gold_ents = use_gold_ents
+ self.candidates_batch_size = candidates_batch_size
self.threshold = threshold
+ if candidates_batch_size < 1:
+ raise ValueError(Errors.E1044)
+
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
create it using this object's vocab."""
if not callable(kb_loader):
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
- self.kb = kb_loader(self.vocab)
+ self.kb = kb_loader(self.vocab) # type: ignore
def validate_kb(self) -> None:
# Raise an error if the knowledge base is not initialized.
@@ -241,8 +266,8 @@ class EntityLinker(TrainablePipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
- kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
- Note that providing this argument, will overwrite all data accumulated in the current KB.
+ kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab
+ instance. Note that providing this argument will overwrite all data accumulated in the current KB.
Use this only when loading a KB as-such from file.
DOCS: https://spacy.io/api/entitylinker#initialize
@@ -419,66 +444,93 @@ class EntityLinker(TrainablePipe):
if len(doc) == 0:
continue
sentences = [s for s in doc.sents]
- # Looping through each entity (TODO: rewrite)
- for ent in doc.ents:
- sent_index = sentences.index(ent.sent)
- assert sent_index >= 0
- if self.incl_context:
- # get n_neighbour sentences, clipped to the length of the document
- start_sentence = max(0, sent_index - self.n_sents)
- end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
- start_token = sentences[start_sentence].start
- end_token = sentences[end_sentence].end
- sent_doc = doc[start_token:end_token].as_doc()
- # currently, the context is the same for each entity in a sentence (should be refined)
- sentence_encoding = self.model.predict([sent_doc])[0]
- sentence_encoding_t = sentence_encoding.T
- sentence_norm = xp.linalg.norm(sentence_encoding_t)
- entity_count += 1
- if ent.label_ in self.labels_discard:
- # ignoring this entity - setting to NIL
- final_kb_ids.append(self.NIL)
- else:
- candidates = list(self.get_candidates(self.kb, ent))
- if not candidates:
- # no prediction possible for this entity - setting to NIL
- final_kb_ids.append(self.NIL)
- elif len(candidates) == 1 and self.threshold is None:
- # shortcut for efficiency reasons: take the 1 candidate
- final_kb_ids.append(candidates[0].entity_)
- else:
- random.shuffle(candidates)
- # set all prior probabilities to 0 if incl_prior=False
- prior_probs = xp.asarray([c.prior_prob for c in candidates])
- if not self.incl_prior:
- prior_probs = xp.asarray([0.0 for _ in candidates])
- scores = prior_probs
- # add in similarity from the context
- if self.incl_context:
- entity_encodings = xp.asarray(
- [c.entity_vector for c in candidates]
- )
- entity_norm = xp.linalg.norm(entity_encodings, axis=1)
- if len(entity_encodings) != len(prior_probs):
- raise RuntimeError(
- Errors.E147.format(
- method="predict",
- msg="vectors not of equal length",
- )
- )
- # cosine similarity
- sims = xp.dot(entity_encodings, sentence_encoding_t) / (
- sentence_norm * entity_norm
- )
- if sims.shape != prior_probs.shape:
- raise ValueError(Errors.E161)
- scores = prior_probs + sims - (prior_probs * sims)
- final_kb_ids.append(
- candidates[scores.argmax().item()].entity_
- if self.threshold is None or scores.max() >= self.threshold
- else EntityLinker.NIL
+ # Loop over entities in batches.
+ for ent_idx in range(0, len(doc.ents), self.candidates_batch_size):
+ ent_batch = doc.ents[ent_idx : ent_idx + self.candidates_batch_size]
+
+ # Look up candidate entities.
+ valid_ent_idx = [
+ idx
+ for idx in range(len(ent_batch))
+ if ent_batch[idx].label_ not in self.labels_discard
+ ]
+
+ batch_candidates = list(
+ self.get_candidates_batch(
+ self.kb, [ent_batch[idx] for idx in valid_ent_idx]
+ )
+ if self.candidates_batch_size > 1
+ else [
+ self.get_candidates(self.kb, ent_batch[idx])
+ for idx in valid_ent_idx
+ ]
+ )
+
+ # Looping through each entity in batch (TODO: rewrite)
+ for j, ent in enumerate(ent_batch):
+ sent_index = sentences.index(ent.sent)
+ assert sent_index >= 0
+
+ if self.incl_context:
+ # get n_neighbour sentences, clipped to the length of the document
+ start_sentence = max(0, sent_index - self.n_sents)
+ end_sentence = min(
+ len(sentences) - 1, sent_index + self.n_sents
)
+ start_token = sentences[start_sentence].start
+ end_token = sentences[end_sentence].end
+ sent_doc = doc[start_token:end_token].as_doc()
+ # currently, the context is the same for each entity in a sentence (should be refined)
+ sentence_encoding = self.model.predict([sent_doc])[0]
+ sentence_encoding_t = sentence_encoding.T
+ sentence_norm = xp.linalg.norm(sentence_encoding_t)
+ entity_count += 1
+ if ent.label_ in self.labels_discard:
+ # ignoring this entity - setting to NIL
+ final_kb_ids.append(self.NIL)
+ else:
+ candidates = list(batch_candidates[j])
+ if not candidates:
+ # no prediction possible for this entity - setting to NIL
+ final_kb_ids.append(self.NIL)
+ elif len(candidates) == 1 and self.threshold is None:
+ # shortcut for efficiency reasons: take the 1 candidate
+ final_kb_ids.append(candidates[0].entity_)
+ else:
+ random.shuffle(candidates)
+ # set all prior probabilities to 0 if incl_prior=False
+ prior_probs = xp.asarray([c.prior_prob for c in candidates])
+ if not self.incl_prior:
+ prior_probs = xp.asarray([0.0 for _ in candidates])
+ scores = prior_probs
+ # add in similarity from the context
+ if self.incl_context:
+ entity_encodings = xp.asarray(
+ [c.entity_vector for c in candidates]
+ )
+ entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+ if len(entity_encodings) != len(prior_probs):
+ raise RuntimeError(
+ Errors.E147.format(
+ method="predict",
+ msg="vectors not of equal length",
+ )
+ )
+ # cosine similarity
+ sims = xp.dot(entity_encodings, sentence_encoding_t) / (
+ sentence_norm * entity_norm
+ )
+ if sims.shape != prior_probs.shape:
+ raise ValueError(Errors.E161)
+ scores = prior_probs + sims - (prior_probs * sims)
+ final_kb_ids.append(
+ candidates[scores.argmax().item()].entity_
+ if self.threshold is None
+ or scores.max() >= self.threshold
+ else EntityLinker.NIL
+ )
+
if not (len(final_kb_ids) == entity_count):
err = Errors.E147.format(
method="predict", msg="result variables not of equal length"
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 3cb1ca676..8154a077d 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -1,6 +1,5 @@
-import warnings
from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
-from typing import cast
+import warnings
from collections import defaultdict
from pathlib import Path
import srsly
@@ -317,7 +316,7 @@ class EntityRuler(Pipe):
phrase_pattern["id"] = ent_id
phrase_patterns.append(phrase_pattern)
for entry in token_patterns + phrase_patterns: # type: ignore[operator]
- label = entry["label"]
+ label = entry["label"] # type: ignore
if "id" in entry:
ent_label = label
label = self._create_label(label, entry["id"])
diff --git a/spacy/pipeline/legacy/entity_linker.py b/spacy/pipeline/legacy/entity_linker.py
index 2f8a1f8ea..c14dfa1db 100644
--- a/spacy/pipeline/legacy/entity_linker.py
+++ b/spacy/pipeline/legacy/entity_linker.py
@@ -68,8 +68,7 @@ class EntityLinker_v1(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
- scorer (Optional[Callable]): The scoring method. Defaults to
- Scorer.score_links.
+ scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
"""
self.vocab = vocab
@@ -115,7 +114,7 @@ class EntityLinker_v1(TrainablePipe):
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
- kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance.
+ kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
Note that providing this argument, will overwrite all data accumulated in the current KB.
Use this only when loading a KB as-such from file.
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 1b7a9eecb..956bbb72c 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -26,17 +26,17 @@ scorer = {"@layers": "spacy.LinearLogistic.v1"}
hidden_size = 128
[model.tok2vec]
-@architectures = "spacy.Tok2Vec.v1"
+@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
width = 96
rows = [5000, 2000, 1000, 1000]
attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[model.tok2vec.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
+@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@@ -133,6 +133,9 @@ def make_spancat(
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
+ scorer (Optional[Callable]): The scoring method. Defaults to
+ Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+ spans allowed.
threshold (float): Minimum probability to consider a prediction positive.
Spans with a positive prediction will be saved on the Doc. Defaults to
0.5.
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index c45f819fc..59549ad99 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -24,8 +24,8 @@ single_label_default_config = """
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[model.tok2vec.encode]
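
The trimmed defaults keep MultiHashEmbed's constraint that rows and attrs line up one-to-one. A direct-construction sketch, assuming the registered architecture is also importable as spacy.ml.models.MultiHashEmbed:

    from spacy.ml.models import MultiHashEmbed

    # One row count per attribute: five attrs, five row counts.
    embed = MultiHashEmbed(
        width=64,
        attrs=["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"],
        rows=[2000, 2000, 500, 1000, 500],
        include_static_vectors=False,
    )
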
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index e33a885f8..eb83d9cb7 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -19,17 +19,17 @@ multi_label_default_config = """
@architectures = "spacy.TextCatEnsemble.v2"
[model.tok2vec]
-@architectures = "spacy.Tok2Vec.v1"
+@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[model.tok2vec.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
+@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@@ -96,8 +96,8 @@ def make_multilabel_textcat(
model: Model[List[Doc], List[Floats2d]],
threshold: float,
scorer: Optional[Callable],
-) -> "TextCategorizer":
- """Create a TextCategorizer component. The text categorizer predicts categories
+) -> "MultiLabel_TextCategorizer":
+ """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
to be non-mutually exclusive, which means that there can be zero or more labels
     per doc.
@@ -105,6 +105,7 @@ def make_multilabel_textcat(
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
+ scorer (Optional[Callable]): The scoring method.
"""
return MultiLabel_TextCategorizer(
nlp.vocab, model, name, threshold=threshold, scorer=scorer
@@ -147,6 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
+ scorer (Optional[Callable]): The scoring method.
DOCS: https://spacy.io/api/textcategorizer#init
"""
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 2e3dde3cb..c742aaeaa 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -123,9 +123,6 @@ class Tok2Vec(TrainablePipe):
width = self.model.get_dim("nO")
return [self.model.ops.alloc((0, width)) for doc in docs]
tokvecs = self.model.predict(docs)
- batch_id = Tok2VecListener.get_batch_id(docs)
- for listener in self.listeners:
- listener.receive(batch_id, tokvecs, _empty_backprop)
return tokvecs
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
@@ -286,8 +283,19 @@ class Tok2VecListener(Model):
def forward(model: Tok2VecListener, inputs, is_train: bool):
"""Supply the outputs from the upstream Tok2Vec component."""
if is_train:
- model.verify_inputs(inputs)
- return model._outputs, model._backprop
+ # This might occur during training when the tok2vec layer is frozen / hasn't been updated.
+ # In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
+ if model._batch_id is None:
+ outputs = []
+ for doc in inputs:
+ if doc.tensor.size == 0:
+ raise ValueError(Errors.E203.format(name="tok2vec"))
+ else:
+ outputs.append(doc.tensor)
+ return outputs, _empty_backprop
+ else:
+ model.verify_inputs(inputs)
+ return model._outputs, model._backprop
else:
# This is pretty grim, but it's hard to do better :(.
# It's hard to avoid relying on the doc.tensor attribute, because the
@@ -306,7 +314,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
outputs.append(model.ops.alloc2f(len(doc), width))
else:
outputs.append(doc.tensor)
- return outputs, lambda dX: []
+ return outputs, _empty_backprop
def _empty_backprop(dX): # for pickling
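
A config excerpt sketching the training scenario this fallback targets: a tok2vec that is frozen but still listed as an annotating component, so listeners can read the embeddings from doc.tensor (the section values are illustrative):

    frozen_tok2vec_config = """
    [training]
    frozen_components = ["tok2vec"]
    annotating_components = ["tok2vec"]
    """
    # With this setup the frozen tok2vec still runs and writes doc.tensor, so the
    # listener's new fallback returns those tensors instead of failing; if the
    # tensors are missing entirely, E203 is raised as in the change above.
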
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 048082134..c824d76b9 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
- EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
- NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
- GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
- LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
- GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
- LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
+ EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
+ NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
+ GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
+ LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
+ GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
+ LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")
class Config:
extra = "forbid"
@@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
# fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: Optional[StrictStr] = Field(None, title="URL of asset")
- checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+ checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: StrictStr = Field("", title="Description of asset")
# fmt: on
@@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
class ProjectConfigAssetGit(BaseModel):
# fmt: off
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
- checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+ checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: Optional[StrictStr] = Field(None, title="Description of asset")
# fmt: on
@@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
None, title="Indices of sentences' start and end indices"
)
text: StrictStr = Field(..., title="Document text")
- spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
- None, title="Span information - end/start indices, label, KB ID"
- )
+ spans: Optional[
+ Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
+ ] = Field(None, title="Span information - end/start indices, label, KB ID")
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
..., title="Token information - ID, start, annotations"
)
@@ -519,9 +519,9 @@ class DocJSONSchema(BaseModel):
title="Any custom data stored in the document's _ attribute",
alias="_",
)
- underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+ underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
None, title="Any custom data stored in the token's _ attribute"
)
- underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+ underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
None, title="Any custom data stored in the span's _ attribute"
)
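
The comparison operators were already optional at runtime; a small Matcher sketch of the patterns TokenPatternNumber validates (the pattern and text are illustrative):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    # ">=" maps to the GEQ field, which is now typed Optional like the others.
    matcher.add("LONG_TOKEN", [[{"LENGTH": {">=": 8}}]])
    doc = nlp("spaCy validates matcher patterns against its schemas")
    print([doc[start:end].text for _, start, end in matcher(doc)])
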
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 742bfcc6a..0fc74243d 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -333,16 +333,24 @@ def ro_tokenizer():
@pytest.fixture(scope="session")
def ru_tokenizer():
- pytest.importorskip("pymorphy2")
+ pytest.importorskip("pymorphy3")
return get_lang_class("ru")().tokenizer
@pytest.fixture
def ru_lemmatizer():
- pytest.importorskip("pymorphy2")
+ pytest.importorskip("pymorphy3")
return get_lang_class("ru")().add_pipe("lemmatizer")
+@pytest.fixture
+def ru_lookup_lemmatizer():
+ pytest.importorskip("pymorphy2")
+ return get_lang_class("ru")().add_pipe(
+ "lemmatizer", config={"mode": "pymorphy2_lookup"}
+ )
+
+
@pytest.fixture(scope="session")
def sa_tokenizer():
return get_lang_class("sa")().tokenizer
@@ -411,15 +419,24 @@ def ky_tokenizer():
@pytest.fixture(scope="session")
def uk_tokenizer():
- pytest.importorskip("pymorphy2")
+ pytest.importorskip("pymorphy3")
return get_lang_class("uk")().tokenizer
@pytest.fixture
def uk_lemmatizer():
+ pytest.importorskip("pymorphy3")
+ pytest.importorskip("pymorphy3_dicts_uk")
+ return get_lang_class("uk")().add_pipe("lemmatizer")
+
+
+@pytest.fixture
+def uk_lookup_lemmatizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy2_dicts_uk")
- return get_lang_class("uk")().add_pipe("lemmatizer")
+ return get_lang_class("uk")().add_pipe(
+ "lemmatizer", config={"mode": "pymorphy2_lookup"}
+ )
@pytest.fixture(scope="session")
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index a64ab2ba8..38003dea9 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -82,6 +82,21 @@ def test_issue2396(en_vocab):
assert (span.get_lca_matrix() == matrix).all()
+@pytest.mark.issue(11499)
+def test_init_args_unmodified(en_vocab):
+ words = ["A", "sentence"]
+ ents = ["B-TYPE1", ""]
+ sent_starts = [True, False]
+ Doc(
+ vocab=en_vocab,
+ words=words,
+ ents=ents,
+ sent_starts=sent_starts,
+ )
+ assert ents == ["B-TYPE1", ""]
+ assert sent_starts == [True, False]
+
+
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
@pytest.mark.parametrize("lang_cls", [English, MultiLanguage])
@pytest.mark.issue(2782)
diff --git a/spacy/tests/doc/test_json_doc_conversion.py b/spacy/tests/doc/test_json_doc_conversion.py
index 0d7c061c9..19698cfb2 100644
--- a/spacy/tests/doc/test_json_doc_conversion.py
+++ b/spacy/tests/doc/test_json_doc_conversion.py
@@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
+ doc[0:2]._.span_test = "span_attribute_2"
doc[0]._.token_test = 117
+ doc[1]._.token_test = 118
doc.spans["span_group"] = [doc[0:1]]
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
assert json_doc["_"]["json_test2"] == [1, 2, 3]
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
- assert json_doc["underscore_token"]["token_test"]["value"] == 117
- assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+ assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+ assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
+ assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+ assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
assert json_doc["_"]["json_test"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
- assert json_doc["underscore_token"]["token_test"]["value"] == 117
- assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+ assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+ assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
assert json_doc["_"]["my_ext"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
- assert json_doc["underscore_token"]["my_ext"]["value"] == 117
- assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
+ assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
+ assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
doc[0]._.token_test = 117
json_doc = doc.to_json(underscore=["span_test"])
- assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
- assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
- assert "token_test" not in json_doc["underscore_token"]
+ assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+ assert "underscore_token" not in json_doc
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
@@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
+ doc[0:2]._.span_test = "span_attribute_2"
doc[0]._.token_test = 117
+ doc[1]._.token_test = 118
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
assert new_doc._.json_test1 == "hello world"
assert new_doc._.json_test2 == [1, 2, 3]
assert new_doc[0]._.token_test == 117
+ assert new_doc[1]._.token_test == 118
assert new_doc[0:1]._.span_test == "span_attribute"
+ assert new_doc[0:2]._.span_test == "span_attribute_2"
assert new_doc.user_data == doc.user_data
assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
exclude=["user_data"]
diff --git a/spacy/tests/lang/grc/test_tokenizer.py b/spacy/tests/lang/grc/test_tokenizer.py
new file mode 100644
index 000000000..3df5b546b
--- /dev/null
+++ b/spacy/tests/lang/grc/test_tokenizer.py
@@ -0,0 +1,18 @@
+import pytest
+
+
+# fmt: off
+GRC_TOKEN_EXCEPTION_TESTS = [
+ ("τὸ 〈τῆς〉 φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ ⟦βαρβάρων⟧ ἄρξαι.", ["τὸ", "〈", "τῆς", "〉", "φιλοσοφίας", "ἔργον", "ἔνιοί", "φασιν", "ἀπὸ", "⟦", "βαρβάρων", "⟧", "ἄρξαι", "."]),
+ ("τὴν δὲ τῶν Αἰγυπτίων φιλοσοφίαν εἶναι τοιαύτην περί τε †θεῶν† καὶ ὑπὲρ δικαιοσύνης.", ["τὴν", "δὲ", "τῶν", "Αἰγυπτίων", "φιλοσοφίαν", "εἶναι", "τοιαύτην", "περί", "τε", "†", "θεῶν", "†", "καὶ", "ὑπὲρ", "δικαιοσύνης", "."]),
+ ("⸏πόσις δ' Ἐρεχθεύς ἐστί μοι σεσωσμένος⸏", ["⸏", "πόσις", "δ'", "Ἐρεχθεύς", "ἐστί", "μοι", "σεσωσμένος", "⸏"]),
+ ("⸏ὔπνον ἴδωμεν⸎", ["⸏", "ὔπνον", "ἴδωμεν", "⸎"]),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", GRC_TOKEN_EXCEPTION_TESTS)
+def test_grc_tokenizer(grc_tokenizer, text, expected_tokens):
+ tokens = grc_tokenizer(text)
+ token_list = [token.text for token in tokens if not token.is_space]
+ assert expected_tokens == token_list
diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py
index 9ca7f441b..e82fd4f8c 100644
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
+
+
+def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+ words = ["мама", "мыла", "раму"]
+ pos = ["NOUN", "VERB", "NOUN"]
+ morphs = [
+ "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
+ "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
+ "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
+ ]
+ doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+ doc = ru_lookup_lemmatizer(doc)
+ lemmas = [token.lemma_ for token in doc]
+ assert lemmas == ["мама", "мыла", "раму"]
diff --git a/spacy/tests/lang/sl/test_text.py b/spacy/tests/lang/sl/test_text.py
index ddc5b6b5d..a2a932077 100644
--- a/spacy/tests/lang/sl/test_text.py
+++ b/spacy/tests/lang/sl/test_text.py
@@ -20,7 +20,6 @@ od katerih so te svoboščine odvisne,
assert len(tokens) == 116
-@pytest.mark.xfail
def test_ordinal_number(sl_tokenizer):
text = "10. decembra 1948"
tokens = sl_tokenizer(text)
diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py
index 57dd4198a..788744aa1 100644
--- a/spacy/tests/lang/uk/test_lemmatizer.py
+++ b/spacy/tests/lang/uk/test_lemmatizer.py
@@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
"""Check that the default uk lemmatizer runs."""
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
uk_lemmatizer(doc)
+ assert [token.lemma for token in doc]
+
+
+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
+ """Check that the lookup uk lemmatizer runs."""
+ doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
+ uk_lookup_lemmatizer(doc)
+ assert [token.lemma for token in doc]
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 82bc976bb..4d683acc5 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -6,7 +6,7 @@ from numpy.testing import assert_equal
from spacy import registry, util
from spacy.attrs import ENT_KB_ID
from spacy.compat import pickle
-from spacy.kb import Candidate, KnowledgeBase, get_candidates
+from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
from spacy.lang.en import English
from spacy.ml import load_kb
from spacy.pipeline import EntityLinker
@@ -34,7 +34,7 @@ def assert_almost_equal(a, b):
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
- kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
@@ -51,7 +51,7 @@ def test_issue4674():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.to_disk(str(file_path))
- kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb2.from_disk(str(file_path))
assert kb2.get_size_entities() == 1
@@ -59,9 +59,9 @@ def test_issue4674():
@pytest.mark.issue(6730)
def test_issue6730(en_vocab):
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
- from spacy.kb import KnowledgeBase
+ from spacy.kb.kb_in_memory import InMemoryLookupKB
- kb = KnowledgeBase(en_vocab, entity_vector_length=3)
+ kb = InMemoryLookupKB(en_vocab, entity_vector_length=3)
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])
with pytest.raises(ValueError):
@@ -127,7 +127,7 @@ def test_issue7065_b():
def create_kb(vocab):
# create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
alias="No. 8",
@@ -190,7 +190,7 @@ def test_no_entities():
def create_kb(vocab):
# create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
return mykb
@@ -231,7 +231,7 @@ def test_partial_links():
def create_kb(vocab):
# create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
return mykb
@@ -263,7 +263,7 @@ def test_partial_links():
def test_kb_valid_entities(nlp):
"""Test the valid construction of a KB with 3 entities and two aliases"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3])
@@ -292,7 +292,7 @@ def test_kb_valid_entities(nlp):
def test_kb_invalid_entities(nlp):
"""Test the invalid construction of a KB with an alias linked to a non-existing entity"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -308,7 +308,7 @@ def test_kb_invalid_entities(nlp):
def test_kb_invalid_probabilities(nlp):
"""Test the invalid construction of a KB with wrong prior probabilities"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -322,7 +322,7 @@ def test_kb_invalid_probabilities(nlp):
def test_kb_invalid_combination(nlp):
"""Test the invalid construction of a KB with non-matching entity and probability lists"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
@@ -338,7 +338,7 @@ def test_kb_invalid_combination(nlp):
def test_kb_invalid_entity_vector(nlp):
"""Test the invalid construction of a KB with non-matching entity vector lengths"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3])
@@ -376,7 +376,7 @@ def test_kb_initialize_empty(nlp):
def test_kb_serialize(nlp):
"""Test serialization of the KB"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
with make_tempdir() as d:
# normal read-write behaviour
mykb.to_disk(d / "kb")
@@ -393,12 +393,12 @@ def test_kb_serialize(nlp):
@pytest.mark.issue(9137)
def test_kb_serialize_2(nlp):
v = [5, 6, 7, 8]
- kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb1.set_entities(["E1"], [1], [v])
assert kb1.get_vector("E1") == v
with make_tempdir() as d:
kb1.to_disk(d / "kb")
- kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb2.from_disk(d / "kb")
assert kb2.get_vector("E1") == v
@@ -408,7 +408,7 @@ def test_kb_set_entities(nlp):
v = [5, 6, 7, 8]
v1 = [1, 1, 1, 0]
v2 = [2, 2, 2, 3]
- kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb1 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb1.set_entities(["E0"], [1], [v])
assert kb1.get_entity_strings() == ["E0"]
kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])
@@ -417,7 +417,7 @@ def test_kb_set_entities(nlp):
assert kb1.get_vector("E2") == v2
with make_tempdir() as d:
kb1.to_disk(d / "kb")
- kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
+ kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=4)
kb2.from_disk(d / "kb")
assert set(kb2.get_entity_strings()) == {"E1", "E2"}
assert kb2.get_vector("E1") == v1
@@ -428,7 +428,7 @@ def test_kb_serialize_vocab(nlp):
"""Test serialization of the KB and custom strings"""
entity = "MyFunnyID"
assert entity not in nlp.vocab.strings
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
assert not mykb.contains_entity(entity)
mykb.add_entity(entity, freq=342, entity_vector=[3])
assert mykb.contains_entity(entity)
@@ -436,14 +436,14 @@ def test_kb_serialize_vocab(nlp):
with make_tempdir() as d:
# normal read-write behaviour
mykb.to_disk(d / "kb")
- mykb_new = KnowledgeBase(Vocab(), entity_vector_length=1)
+ mykb_new = InMemoryLookupKB(Vocab(), entity_vector_length=1)
mykb_new.from_disk(d / "kb")
assert entity in mykb_new.vocab.strings
def test_candidate_generation(nlp):
"""Test correct candidate generation"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
doc = nlp("douglas adam Adam shrubbery")
douglas_ent = doc[0:1]
@@ -481,7 +481,7 @@ def test_el_pipe_configuration(nlp):
ruler.add_patterns([pattern])
def create_kb(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=1)
+ kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
@@ -500,10 +500,21 @@ def test_el_pipe_configuration(nlp):
def get_lowercased_candidates(kb, span):
return kb.get_alias_candidates(span.text.lower())
+ def get_lowercased_candidates_batch(kb, spans):
+ return [get_lowercased_candidates(kb, span) for span in spans]
+
@registry.misc("spacy.LowercaseCandidateGenerator.v1")
- def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
+ def create_candidates() -> Callable[
+ [InMemoryLookupKB, "Span"], Iterable[Candidate]
+ ]:
return get_lowercased_candidates
+ @registry.misc("spacy.LowercaseCandidateBatchGenerator.v1")
+ def create_candidates_batch() -> Callable[
+ [InMemoryLookupKB, Iterable["Span"]], Iterable[Iterable[Candidate]]
+ ]:
+ return get_lowercased_candidates_batch
+
# replace the pipe with a new one with with a different candidate generator
entity_linker = nlp.replace_pipe(
"entity_linker",
@@ -511,6 +522,9 @@ def test_el_pipe_configuration(nlp):
config={
"incl_context": False,
"get_candidates": {"@misc": "spacy.LowercaseCandidateGenerator.v1"},
+ "get_candidates_batch": {
+ "@misc": "spacy.LowercaseCandidateBatchGenerator.v1"
+ },
},
)
entity_linker.set_kb(create_kb)
@@ -532,7 +546,7 @@ def test_nel_nsents(nlp):
def test_vocab_serialization(nlp):
"""Test that string information is retained across storage"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -552,7 +566,7 @@ def test_vocab_serialization(nlp):
with make_tempdir() as d:
mykb.to_disk(d / "kb")
- kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
+ kb_new_vocab = InMemoryLookupKB(Vocab(), entity_vector_length=1)
kb_new_vocab.from_disk(d / "kb")
candidates = kb_new_vocab.get_alias_candidates("adam")
@@ -568,7 +582,7 @@ def test_vocab_serialization(nlp):
def test_append_alias(nlp):
"""Test that we can append additional alias-entity pairs"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -599,7 +613,7 @@ def test_append_alias(nlp):
@pytest.mark.filterwarnings("ignore:\\[W036")
def test_append_invalid_alias(nlp):
"""Test that append an alias will throw an error if prior probs are exceeding 1"""
- mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ mykb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
# adding entities
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
@@ -621,7 +635,7 @@ def test_preserving_links_asdoc(nlp):
vector_length = 1
def create_kb(vocab):
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
# adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
@@ -723,7 +737,7 @@ def test_overfitting_IO():
# create artificial KB - assign same prior weight to the two russ cochran's
# Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
@@ -805,7 +819,7 @@ def test_kb_serialization():
kb_dir = tmp_dir / "kb"
nlp1 = English()
assert "Q2146908" not in nlp1.vocab.strings
- mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(nlp1.vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
assert "Q2146908" in nlp1.vocab.strings
@@ -828,7 +842,7 @@ def test_kb_serialization():
def test_kb_pickle():
# Test that the KB can be pickled
nlp = English()
- kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
assert not kb_1.contains_alias("Russ Cochran")
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
@@ -842,7 +856,7 @@ def test_kb_pickle():
def test_nel_pickle():
# Test that a pipeline with an EL component can be pickled
def create_kb(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=3)
+ kb = InMemoryLookupKB(vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
return kb
@@ -864,7 +878,7 @@ def test_nel_pickle():
def test_kb_to_bytes():
# Test that the KB's to_bytes method works correctly
nlp = English()
- kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb_1 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
@@ -874,7 +888,7 @@ def test_kb_to_bytes():
)
assert kb_1.contains_alias("Russ Cochran")
kb_bytes = kb_1.to_bytes()
- kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+ kb_2 = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
assert not kb_2.contains_alias("Russ Cochran")
kb_2 = kb_2.from_bytes(kb_bytes)
# check that both KBs are exactly the same
@@ -897,7 +911,7 @@ def test_kb_to_bytes():
def test_nel_to_bytes():
# Test that a pipeline with an EL component can be converted to bytes
def create_kb(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=3)
+ kb = InMemoryLookupKB(vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
return kb
@@ -987,7 +1001,7 @@ def test_legacy_architectures(name, config):
train_examples.append(Example.from_dict(doc, annotation))
def create_kb(vocab):
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
@@ -1054,7 +1068,7 @@ def test_no_gold_ents(patterns):
def create_kb(vocab):
# create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Kirby", ["Q613241"], [0.9])
# Placeholder
@@ -1104,7 +1118,7 @@ def test_tokenization_mismatch():
def create_kb(vocab):
# create placeholder KB
- mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
mykb.add_alias("Kirby", ["Q613241"], [0.9])
return mykb
@@ -1121,6 +1135,12 @@ def test_tokenization_mismatch():
nlp.evaluate(train_examples)
+def test_abstract_kb_instantiation():
+ """Test whether instantiation of abstract KB base class fails."""
+ with pytest.raises(TypeError):
+ KnowledgeBase(None, 3)
+
+
# fmt: off
@pytest.mark.parametrize(
"meet_threshold,config",
@@ -1151,7 +1171,7 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
def create_kb(vocab):
# create artificial KB
- mykb = KnowledgeBase(vocab, entity_vector_length=3)
+ mykb = InMemoryLookupKB(vocab, entity_vector_length=3)
mykb.add_entity(entity=entity_id, freq=12, entity_vector=[6, -4, 3])
mykb.add_alias(
alias="Mahler",
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index b946061f6..14a7a36e5 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -605,10 +605,35 @@ def test_update_with_annotates():
assert results[component] == ""
-def test_load_disable_enable() -> None:
- """
- Tests spacy.load() with dis-/enabling components.
- """
+@pytest.mark.issue(11443)
+def test_enable_disable_conflict_with_config():
+ """Test conflict between enable/disable w.r.t. `nlp.disabled` set in the config."""
+ nlp = English()
+ nlp.add_pipe("tagger")
+ nlp.add_pipe("senter")
+ nlp.add_pipe("sentencizer")
+
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ # Expected to fail, as config and arguments conflict.
+ with pytest.raises(ValueError):
+ spacy.load(
+ tmp_dir, enable=["tagger"], config={"nlp": {"disabled": ["senter"]}}
+ )
+ # Expected to succeed without warning due to the lack of a conflicting config option.
+ spacy.load(tmp_dir, enable=["tagger"])
+ # Expected to succeed with a warning, as disable=[] should override the config setting.
+ with pytest.warns(UserWarning):
+ spacy.load(
+ tmp_dir,
+ enable=["tagger"],
+ disable=[],
+ config={"nlp": {"disabled": ["senter"]}},
+ )
+
+
+def test_load_disable_enable():
+ """Tests spacy.load() with dis-/enabling components."""
base_nlp = English()
for pipe in ("sentencizer", "tagger", "parser"):
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 64faf133d..e423d9a19 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -230,6 +230,97 @@ def test_tok2vec_listener_callback():
assert get_dX(Y) is not None
+def test_tok2vec_listener_overfitting():
+ """Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
+ orig_config = Config().from_str(cfg_string)
+ nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+ train_examples = []
+ for t in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+ optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+ for i in range(50):
+ losses = {}
+ nlp.update(train_examples, sgd=optimizer, losses=losses, annotates=["tok2vec"])
+ assert losses["tagger"] < 0.00001
+
+ # test the trained model
+ test_text = "I like blue eggs"
+ doc = nlp(test_text)
+ assert doc[0].tag_ == "N"
+ assert doc[1].tag_ == "V"
+ assert doc[2].tag_ == "J"
+ assert doc[3].tag_ == "N"
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ assert doc2[0].tag_ == "N"
+ assert doc2[1].tag_ == "V"
+ assert doc2[2].tag_ == "J"
+ assert doc2[3].tag_ == "N"
+
+
+def test_tok2vec_frozen_not_annotating():
+ """Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
+ orig_config = Config().from_str(cfg_string)
+ nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+ train_examples = []
+ for t in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+ optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+ for i in range(2):
+ losses = {}
+ with pytest.raises(
+ ValueError, match=r"the tok2vec embedding layer is not updated"
+ ):
+ nlp.update(
+ train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
+ )
+
+
+def test_tok2vec_frozen_overfitting():
+ """Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
+ orig_config = Config().from_str(cfg_string)
+ nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+ train_examples = []
+ for t in TRAIN_DATA:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+ optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+ for i in range(100):
+ losses = {}
+ nlp.update(
+ train_examples,
+ sgd=optimizer,
+ losses=losses,
+ exclude=["tok2vec"],
+ annotates=["tok2vec"],
+ )
+ assert losses["tagger"] < 0.0001
+
+ # test the trained model
+ test_text = "I like blue eggs"
+ doc = nlp(test_text)
+ assert doc[0].tag_ == "N"
+ assert doc[1].tag_ == "V"
+ assert doc[2].tag_ == "J"
+ assert doc[3].tag_ == "N"
+
+ # Also test the results are still the same after IO
+ with make_tempdir() as tmp_dir:
+ nlp.to_disk(tmp_dir)
+ nlp2 = util.load_model_from_path(tmp_dir)
+ doc2 = nlp2(test_text)
+ assert doc2[0].tag_ == "N"
+ assert doc2[1].tag_ == "V"
+ assert doc2[2].tag_ == "J"
+ assert doc2[3].tag_ == "N"
+
+
def test_replace_listeners():
orig_config = Config().from_str(cfg_string)
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
diff --git a/spacy/tests/serialize/test_resource_warning.py b/spacy/tests/serialize/test_resource_warning.py
index a00b2a688..38701c6d9 100644
--- a/spacy/tests/serialize/test_resource_warning.py
+++ b/spacy/tests/serialize/test_resource_warning.py
@@ -3,7 +3,7 @@ from unittest import TestCase
import pytest
import srsly
from numpy import zeros
-from spacy.kb import KnowledgeBase, Writer
+from spacy.kb.kb_in_memory import InMemoryLookupKB, Writer
from spacy.vectors import Vectors
from spacy.language import Language
from spacy.pipeline import TrainablePipe
@@ -71,7 +71,7 @@ def entity_linker():
nlp = Language()
def create_kb(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=1)
+ kb = InMemoryLookupKB(vocab, entity_vector_length=1)
kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
return kb
@@ -120,7 +120,7 @@ def test_writer_with_path_py35():
def test_save_and_load_knowledge_base():
nlp = Language()
- kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
with make_tempdir() as d:
path = d / "kb"
try:
@@ -129,7 +129,7 @@ def test_save_and_load_knowledge_base():
pytest.fail(str(e))
try:
- kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+ kb_loaded = InMemoryLookupKB(nlp.vocab, entity_vector_length=1)
kb_loaded.from_disk(path)
except Exception as e:
pytest.fail(str(e))
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 1e0ae3c76..8d3653ab1 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -2,7 +2,7 @@ from typing import Callable
from spacy import util
from spacy.util import ensure_path, registry, load_model_from_config
-from spacy.kb import KnowledgeBase
+from spacy.kb.kb_in_memory import InMemoryLookupKB
from spacy.vocab import Vocab
from thinc.api import Config
@@ -22,7 +22,7 @@ def test_serialize_kb_disk(en_vocab):
dir_path.mkdir()
file_path = dir_path / "kb"
kb1.to_disk(str(file_path))
- kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
+ kb2 = InMemoryLookupKB(vocab=en_vocab, entity_vector_length=3)
kb2.from_disk(str(file_path))
# final assertions
@@ -30,7 +30,7 @@ def test_serialize_kb_disk(en_vocab):
def _get_dummy_kb(vocab):
- kb = KnowledgeBase(vocab, entity_vector_length=3)
+ kb = InMemoryLookupKB(vocab, entity_vector_length=3)
kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3])
kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0])
kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7])
@@ -104,7 +104,7 @@ def test_serialize_subclassed_kb():
custom_field = 666
"""
- class SubKnowledgeBase(KnowledgeBase):
+ class SubInMemoryLookupKB(InMemoryLookupKB):
def __init__(self, vocab, entity_vector_length, custom_field):
super().__init__(vocab, entity_vector_length)
self.custom_field = custom_field
@@ -112,9 +112,9 @@ def test_serialize_subclassed_kb():
@registry.misc("spacy.CustomKB.v1")
def custom_kb(
entity_vector_length: int, custom_field: int
- ) -> Callable[[Vocab], KnowledgeBase]:
+ ) -> Callable[[Vocab], InMemoryLookupKB]:
def custom_kb_factory(vocab):
- kb = SubKnowledgeBase(
+ kb = SubInMemoryLookupKB(
vocab=vocab,
entity_vector_length=entity_vector_length,
custom_field=custom_field,
@@ -129,7 +129,7 @@ def test_serialize_subclassed_kb():
nlp.initialize()
entity_linker = nlp.get_pipe("entity_linker")
- assert type(entity_linker.kb) == SubKnowledgeBase
+ assert type(entity_linker.kb) == SubInMemoryLookupKB
assert entity_linker.kb.entity_vector_length == 342
assert entity_linker.kb.custom_field == 666
@@ -139,6 +139,6 @@ def test_serialize_subclassed_kb():
nlp2 = util.load_model_from_path(tmp_dir)
entity_linker2 = nlp2.get_pipe("entity_linker")
# After IO, the KB is the standard one
- assert type(entity_linker2.kb) == KnowledgeBase
+ assert type(entity_linker2.kb) == InMemoryLookupKB
assert entity_linker2.kb.entity_vector_length == 342
assert not hasattr(entity_linker2.kb, "custom_field")
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index 9fcf18e2d..b948bb76c 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -404,10 +404,11 @@ def test_serialize_pipeline_disable_enable():
assert nlp3.component_names == ["ner", "tagger"]
with make_tempdir() as d:
nlp3.to_disk(d)
- nlp4 = spacy.load(d, disable=["ner"])
- assert nlp4.pipe_names == []
+ with pytest.warns(UserWarning):
+ nlp4 = spacy.load(d, disable=["ner"])
+ assert nlp4.pipe_names == ["tagger"]
assert nlp4.component_names == ["ner", "tagger"]
- assert nlp4.disabled == ["ner", "tagger"]
+ assert nlp4.disabled == ["ner"]
with make_tempdir() as d:
nlp.to_disk(d)
nlp5 = spacy.load(d, exclude=["tagger"])
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 2306cabb7..d91ed1201 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():
def get_textcat_cnn_kwargs():
- return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+ return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
}
-def test_tok2vec():
+def make_test_tok2vec():
return build_Tok2Vec_model(**get_tok2vec_kwargs())
diff --git a/spacy/tests/training/test_augmenters.py b/spacy/tests/training/test_augmenters.py
index e3639c5da..35860a199 100644
--- a/spacy/tests/training/test_augmenters.py
+++ b/spacy/tests/training/test_augmenters.py
@@ -31,7 +31,7 @@ def doc(nlp):
words = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
pos = ["PROPN", "PART", "NOUN", "VERB", "ADP", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
- ents = ["B-PERSON", "I-PERSON", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-GPE", "O"]
+ ents = ["B-PERSON", "I-PERSON", "O", "", "O", "B-LOC", "I-LOC", "O", "B-GPE", "O"]
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
# fmt: on
doc = Doc(nlp.vocab, words=words, tags=tags, pos=pos, ents=ents)
@@ -106,6 +106,7 @@ def test_lowercase_augmenter(nlp, doc):
assert [(e.start, e.end, e.label) for e in eg.reference.ents] == ents
for ref_ent, orig_ent in zip(eg.reference.ents, doc.ents):
assert ref_ent.text == orig_ent.text.lower()
+ assert [t.ent_iob for t in doc] == [t.ent_iob for t in eg.reference]
assert [t.pos_ for t in eg.reference] == [t.pos_ for t in doc]
# check that augmentation works when lowercasing leads to different
@@ -166,7 +167,7 @@ def test_make_whitespace_variant(nlp):
lemmas = ["they", "fly", "to", "New", "York", "City", ".", "\n", "then", "they", "drive", "to", "Washington", ",", "D.C."]
heads = [1, 1, 1, 4, 5, 2, 1, 10, 10, 10, 10, 10, 11, 12, 12]
deps = ["nsubj", "ROOT", "prep", "compound", "compound", "pobj", "punct", "dep", "advmod", "nsubj", "ROOT", "prep", "pobj", "punct", "appos"]
- ents = ["O", "O", "O", "B-GPE", "I-GPE", "I-GPE", "O", "O", "O", "O", "O", "O", "B-GPE", "O", "B-GPE"]
+ ents = ["O", "", "O", "B-GPE", "I-GPE", "I-GPE", "O", "O", "O", "O", "O", "O", "B-GPE", "O", "B-GPE"]
# fmt: on
doc = Doc(
nlp.vocab,
@@ -215,6 +216,8 @@ def test_make_whitespace_variant(nlp):
assert mod_ex2.reference[j].head.i == j - 1
# entities are well-formed
assert len(doc.ents) == len(mod_ex.reference.ents)
+ # there is one token with missing entity information
+ assert any(t.ent_iob == 0 for t in mod_ex.reference)
for ent in mod_ex.reference.ents:
assert not ent[0].is_space
assert not ent[-1].is_space
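The augmenter tests above distinguish tokens with missing entity annotation (`ent_iob == 0`, written as `""` in the `ents` list) from explicit non-entities (`"O"`, `ent_iob == 2`). A short sketch of that distinction at the `Doc` level:

```python
# Sketch: "" marks missing entity annotation (ent_iob == 0), while "O" is an
# explicit non-entity (ent_iob == 2); the augmenters now preserve the former.
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = Doc(
    nlp.vocab,
    words=["Sarah", "flew", "to", "London"],
    ents=["B-PERSON", "", "O", "B-GPE"],
)
print([(t.text, t.ent_iob, t.ent_iob_) for t in doc])
# [('Sarah', 3, 'B'), ('flew', 0, ''), ('to', 2, 'O'), ('London', 3, 'B')]
```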
diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py
index 9630da261..6edcce13d 100644
--- a/spacy/tokens/_dict_proxies.py
+++ b/spacy/tokens/_dict_proxies.py
@@ -42,7 +42,8 @@ class SpanGroups(UserDict):
def copy(self, doc: Optional["Doc"] = None) -> "SpanGroups":
if doc is None:
doc = self._ensure_doc()
- return SpanGroups(doc).from_bytes(self.to_bytes())
+ data_copy = ((k, v.copy(doc=doc)) for k, v in self.items())
+ return SpanGroups(doc, items=data_copy)
def setdefault(self, key, default=None):
if not isinstance(default, SpanGroup):
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index a40fa74aa..f0cdaee87 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -72,7 +72,7 @@ class Doc:
lemmas: Optional[List[str]] = ...,
heads: Optional[List[int]] = ...,
deps: Optional[List[str]] = ...,
- sent_starts: Optional[List[Union[bool, None]]] = ...,
+ sent_starts: Optional[List[Union[bool, int, None]]] = ...,
ents: Optional[List[str]] = ...,
) -> None: ...
@property
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 7ba9a3341..295f91c28 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -217,9 +217,9 @@ cdef class Doc:
head in the doc. Defaults to None.
deps (Optional[List[str]]): A list of unicode strings, of the same
length as words, to assign as token.dep. Defaults to None.
- sent_starts (Optional[List[Union[bool, None]]]): A list of values, of
- the same length as words, to assign as token.is_sent_start. Will be
- overridden by heads if heads is provided. Defaults to None.
+ sent_starts (Optional[List[Union[bool, int, None]]]): A list of values,
+ of the same length as words, to assign as token.is_sent_start. Will
+ be overridden by heads if heads is provided. Defaults to None.
ents (Optional[List[str]]): A list of unicode strings, of the same
length as words, as IOB tags to assign as token.ent_iob and
token.ent_type. Defaults to None.
@@ -285,6 +285,7 @@ cdef class Doc:
heads = [0] * len(deps)
if heads and not deps:
raise ValueError(Errors.E1017)
+ sent_starts = list(sent_starts) if sent_starts is not None else None
if sent_starts is not None:
for i in range(len(sent_starts)):
if sent_starts[i] is True:
@@ -300,12 +301,11 @@ cdef class Doc:
ent_iobs = None
ent_types = None
if ents is not None:
+ ents = [ent if ent != "" else None for ent in ents]
iob_strings = Token.iob_strings()
# make valid IOB2 out of IOB1 or IOB2
for i, ent in enumerate(ents):
- if ent is "":
- ents[i] = None
- elif ent is not None and not isinstance(ent, str):
+ if ent is not None and not isinstance(ent, str):
raise ValueError(Errors.E177.format(tag=ent))
if i < len(ents) - 1:
# OI -> OB
@@ -1608,24 +1608,20 @@ cdef class Doc:
Doc.set_extension(attr)
self._.set(attr, doc_json["_"][attr])
- if doc_json.get("underscore_token", {}):
- for token_attr in doc_json["underscore_token"]:
- token_start = doc_json["underscore_token"][token_attr]["token_start"]
- value = doc_json["underscore_token"][token_attr]["value"]
-
- if not Token.has_extension(token_attr):
- Token.set_extension(token_attr)
- self[token_start]._.set(token_attr, value)
+ for token_attr in doc_json.get("underscore_token", {}):
+ if not Token.has_extension(token_attr):
+ Token.set_extension(token_attr)
+ for token_data in doc_json["underscore_token"][token_attr]:
+ start = token_by_char(self.c, self.length, token_data["start"])
+ value = token_data["value"]
+ self[start]._.set(token_attr, value)
- if doc_json.get("underscore_span", {}):
- for span_attr in doc_json["underscore_span"]:
- token_start = doc_json["underscore_span"][span_attr]["token_start"]
- token_end = doc_json["underscore_span"][span_attr]["token_end"]
- value = doc_json["underscore_span"][span_attr]["value"]
-
- if not Span.has_extension(span_attr):
- Span.set_extension(span_attr)
- self[token_start:token_end]._.set(span_attr, value)
+ for span_attr in doc_json.get("underscore_span", {}):
+ if not Span.has_extension(span_attr):
+ Span.set_extension(span_attr)
+ for span_data in doc_json["underscore_span"][span_attr]:
+ value = span_data["value"]
+ self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
return self
def to_json(self, underscore=None):
@@ -1673,30 +1669,34 @@ cdef class Doc:
if underscore:
user_keys = set()
if self.user_data:
- data["_"] = {}
- data["underscore_token"] = {}
- data["underscore_span"] = {}
- for data_key in self.user_data:
+ for data_key, value in self.user_data.copy().items():
if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
attr = data_key[1]
start = data_key[2]
end = data_key[3]
if attr in underscore:
user_keys.add(attr)
- value = self.user_data[data_key]
if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
# Check if doc attribute
if start is None:
+ if "_" not in data:
+ data["_"] = {}
data["_"][attr] = value
# Check if token attribute
elif end is None:
+ if "underscore_token" not in data:
+ data["underscore_token"] = {}
if attr not in data["underscore_token"]:
- data["underscore_token"][attr] = {"token_start": start, "value": value}
+ data["underscore_token"][attr] = []
+ data["underscore_token"][attr].append({"start": start, "value": value})
# Else span attribute
else:
+ if "underscore_span" not in data:
+ data["underscore_span"] = {}
if attr not in data["underscore_span"]:
- data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
+ data["underscore_span"][attr] = []
+ data["underscore_span"][attr].append({"start": start, "end": end, "value": value})
for attr in underscore:
if attr not in user_keys:
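The rewritten `to_json`/`from_json` above changes the layout of custom extension data: `underscore_token` and `underscore_span` now hold lists of entries with character offsets (`"start"`/`"end"`) instead of a single `"token_start"` entry per attribute. A small sketch of the resulting structure, assuming a token extension named `is_city`:

```python
# Sketch of the revised extension layout produced by to_json(); "is_city" is
# an illustrative extension name, and "start" is a character offset.
from spacy.lang.en import English
from spacy.tokens import Doc, Token

Token.set_extension("is_city", default=False)

nlp = English()
doc = nlp("I like Paris")
doc[2]._.is_city = True

data = doc.to_json(underscore=["is_city"])
print(data["underscore_token"]["is_city"])
# expected: [{"start": 7, "value": True}]

# from_json() reads the same layout back, resolving tokens by character offset.
doc2 = Doc(nlp.vocab).from_json(data)
assert doc2[2]._.is_city is True
```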
diff --git a/spacy/tokens/span_group.pyi b/spacy/tokens/span_group.pyi
index 245eb4dbe..21cd124ab 100644
--- a/spacy/tokens/span_group.pyi
+++ b/spacy/tokens/span_group.pyi
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterable
+from typing import Any, Dict, Iterable, Optional
from .doc import Doc
from .span import Span
@@ -24,4 +24,4 @@ class SpanGroup:
def __getitem__(self, i: int) -> Span: ...
def to_bytes(self) -> bytes: ...
def from_bytes(self, bytes_data: bytes) -> SpanGroup: ...
- def copy(self) -> SpanGroup: ...
+ def copy(self, doc: Optional[Doc] = ...) -> SpanGroup: ...
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index bb0fab24f..1aa3c0bc8 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -241,15 +241,18 @@ cdef class SpanGroup:
cdef void push_back(self, SpanC span) nogil:
self.c.push_back(span)
- def copy(self) -> SpanGroup:
+ def copy(self, doc: Optional["Doc"] = None) -> SpanGroup:
"""Clones the span group.
+ doc (Doc): New reference document to which the copy is bound.
RETURNS (SpanGroup): A copy of the span group.
DOCS: https://spacy.io/api/spangroup#copy
"""
+ if doc is None:
+ doc = self.doc
return SpanGroup(
- self.doc,
+ doc,
name=self.name,
attrs=deepcopy(self.attrs),
spans=list(self),
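`SpanGroup.copy` (and `SpanGroups.copy` above) now accepts an optional `doc` argument so a copied group can be re-bound to another `Doc`. A minimal sketch of that usage, assuming both docs share the same tokenization so the copied span indices stay valid:

```python
# Minimal sketch of the new copy(doc=...) keyword; doc2 intentionally has the
# same tokenization as doc1.
from spacy.lang.en import English
from spacy.tokens import Doc, Span

nlp = English()
words = ["Welcome", "to", "Berlin", "!"]
doc1 = Doc(nlp.vocab, words=words)
doc1.spans["cities"] = [Span(doc1, 2, 3, label="CITY")]

doc2 = Doc(nlp.vocab, words=words)
doc2.spans["cities"] = doc1.spans["cities"].copy(doc=doc2)

assert doc2.spans["cities"][0].text == "Berlin"
assert doc2.spans["cities"][0].doc is doc2
```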
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 55d780ba4..2fe8c24fb 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -6,7 +6,7 @@ from functools import partial
from ..util import registry
from .example import Example
-from .iob_utils import split_bilu_label
+from .iob_utils import split_bilu_label, _doc_to_biluo_tags_with_partial
if TYPE_CHECKING:
from ..language import Language # noqa: F401
@@ -62,6 +62,9 @@ def combined_augmenter(
if orth_variants and random.random() < orth_level:
raw_text = example.text
orig_dict = example.to_dict()
+ orig_dict["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
+ example.reference
+ )
variant_text, variant_token_annot = make_orth_variants(
nlp,
raw_text,
@@ -128,6 +131,9 @@ def lower_casing_augmenter(
def make_lowercase_variant(nlp: "Language", example: Example):
example_dict = example.to_dict()
+ example_dict["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
+ example.reference
+ )
doc = nlp.make_doc(example.text.lower())
example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference]
return example.from_dict(doc, example_dict)
@@ -146,6 +152,9 @@ def orth_variants_augmenter(
else:
raw_text = example.text
orig_dict = example.to_dict()
+ orig_dict["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
+ example.reference
+ )
variant_text, variant_token_annot = make_orth_variants(
nlp,
raw_text,
@@ -248,6 +257,9 @@ def make_whitespace_variant(
RETURNS (Example): Example with one additional space token.
"""
example_dict = example.to_dict()
+ example_dict["doc_annotation"]["entities"] = _doc_to_biluo_tags_with_partial(
+ example.reference
+ )
doc_dict = example_dict.get("doc_annotation", {})
token_dict = example_dict.get("token_annotation", {})
# returned unmodified if:
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index 61f83a1c3..0d4d246b0 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -60,6 +60,14 @@ def doc_to_biluo_tags(doc: Doc, missing: str = "O"):
)
+def _doc_to_biluo_tags_with_partial(doc: Doc) -> List[str]:
+ ents = doc_to_biluo_tags(doc, missing="-")
+ for i, token in enumerate(doc):
+ if token.ent_iob == 2:
+ ents[i] = "O"
+ return ents
+
+
def offsets_to_biluo_tags(
doc: Doc, entities: Iterable[Tuple[int, int, Union[str, int]]], missing: str = "O"
) -> List[str]:
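The private helper added above keeps missing annotation distinct from explicit `O` tags when converting to BILUO, which is what lets the augmenters round-trip partially annotated entities. An illustration of its output (the helper is private, so this is for explanation only):

```python
# Illustration of _doc_to_biluo_tags_with_partial: tokens with missing
# annotation stay "-", explicit non-entities (ent_iob == 2) become "O".
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.training.iob_utils import _doc_to_biluo_tags_with_partial

nlp = English()
doc = Doc(
    nlp.vocab,
    words=["I", "like", "New", "York"],
    ents=["O", "", "B-GPE", "I-GPE"],
)
print(_doc_to_biluo_tags_with_partial(doc))
# expected: ["O", "-", "B-GPE", "L-GPE"]
```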
diff --git a/spacy/util.py b/spacy/util.py
index 4e1a62d05..3034808ba 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -67,7 +67,6 @@ LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru",
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining", "initialize"]
# fmt: on
-
logger = logging.getLogger("spacy")
logger_stream_handler = logging.StreamHandler()
logger_stream_handler.setFormatter(
@@ -394,13 +393,17 @@ def get_module_path(module: ModuleType) -> Path:
return file_path.parent
+# Default value for passed enable/disable values.
+_DEFAULT_EMPTY_PIPES = SimpleFrozenList()
+
+
def load_model(
name: Union[str, Path],
*,
vocab: Union["Vocab", bool] = True,
- disable: Union[str, Iterable[str]] = SimpleFrozenList(),
- enable: Union[str, Iterable[str]] = SimpleFrozenList(),
- exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+ disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a package or data path.
@@ -470,9 +473,9 @@ def load_model_from_path(
*,
meta: Optional[Dict[str, Any]] = None,
vocab: Union["Vocab", bool] = True,
- disable: Union[str, Iterable[str]] = SimpleFrozenList(),
- enable: Union[str, Iterable[str]] = SimpleFrozenList(),
- exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+ disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a data directory path. Creates Language class with
@@ -516,9 +519,9 @@ def load_model_from_config(
*,
meta: Dict[str, Any] = SimpleFrozenDict(),
vocab: Union["Vocab", bool] = True,
- disable: Union[str, Iterable[str]] = SimpleFrozenList(),
- enable: Union[str, Iterable[str]] = SimpleFrozenList(),
- exclude: Union[str, Iterable[str]] = SimpleFrozenList(),
+ disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
+ exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
auto_fill: bool = False,
validate: bool = True,
) -> "Language":
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 2537faff6..4c5447f75 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -11,6 +11,7 @@ menu:
- ['Text Classification', 'textcat']
- ['Span Classification', 'spancat']
- ['Entity Linking', 'entitylinker']
+ - ['Coreference', 'coref-architectures']
---
A **model architecture** is a function that wires up a
@@ -587,8 +588,8 @@ consists of either two or three subnetworks:
run once for each batch.
- **lower**: Construct a feature-specific vector for each `(token, feature)`
pair. This is also run once for each batch. Constructing the state
- representation is then a matter of summing the component features and
- applying the non-linearity.
+ representation is then a matter of summing the component features and applying
+ the non-linearity.
- **upper** (optional): A feed-forward network that predicts scores from the
state representation. If not present, the output from the lower model is used
as action scores directly.
@@ -628,8 +629,8 @@ same signature, but the `use_upper` argument was `True` by default.
> ```
Build a tagger model, using a provided token-to-vector component. The tagger
-model adds a linear layer with softmax activation to predict scores given
-the token vectors.
+model adds a linear layer with softmax activation to predict scores given the
+token vectors.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
@@ -920,5 +921,84 @@ A function that reads an existing `KnowledgeBase` from file.
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
[`Span`](/api/span) object denoting a named entity, and returns a list of
plausible [`Candidate`](/api/kb/#candidate) objects. The default
-`CandidateGenerator` uses the text of a mention to find its potential
-aliases in the `KnowledgeBase`. Note that this function is case-dependent.
+`CandidateGenerator` uses the text of a mention to find its potential aliases in
+the `KnowledgeBase`. Note that this function is case-dependent.
+
+## Coreference {#coref-architectures tag="experimental"}
+
+A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to
+the same entity. A [`SpanResolver`](/api/span-resolver) component infers spans
+from single tokens. Together these components can be used to reproduce
+traditional coreference models. You can also omit the `SpanResolver` if working
+with only token-level clusters is acceptable.
+
+### spacy-experimental.Coref.v1 {#Coref tag="experimental"}
+
+> #### Example Config
+>
+> ```ini
+>
+> [model]
+> @architectures = "spacy-experimental.Coref.v1"
+> distance_embedding_size = 20
+> dropout = 0.3
+> hidden_size = 1024
+> depth = 2
+> antecedent_limit = 50
+> antecedent_batch_size = 512
+>
+> [model.tok2vec]
+> @architectures = "spacy-transformers.TransformerListener.v1"
+> grad_factor = 1.0
+> upstream = "transformer"
+> pooling = {"@layers":"reduce_mean.v1"}
+> ```
+
+The `Coref` model architecture is a Thinc `Model`.
+
+| Name | Description |
+| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
+| `distance_embedding_size` | A representation of the distance between candidates. ~~int~~ |
+| `dropout` | The dropout to use internally. Unlike some Thinc models, this has separate dropout for the internal PyTorch layers. ~~float~~ |
+| `hidden_size` | Size of the main internal layers. ~~int~~ |
+| `depth` | Depth of the internal network. ~~int~~ |
+| `antecedent_limit` | How many candidate antecedents to keep after rough scoring. This has a significant effect on memory usage. Typical values would be 50 to 200, or higher for very long documents. ~~int~~ |
+| `antecedent_batch_size` | Internal batch size. ~~int~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+
+### spacy-experimental.SpanResolver.v1 {#SpanResolver tag="experimental"}
+
+> #### Example Config
+>
+> ```ini
+>
+> [model]
+> @architectures = "spacy-experimental.SpanResolver.v1"
+> hidden_size = 1024
+> distance_embedding_size = 64
+> conv_channels = 4
+> window_size = 1
+> max_distance = 128
+> prefix = "coref_head_clusters"
+>
+> [model.tok2vec]
+> @architectures = "spacy-transformers.TransformerListener.v1"
+> grad_factor = 1.0
+> upstream = "transformer"
+> pooling = {"@layers":"reduce_mean.v1"}
+> ```
+
+The `SpanResolver` model architecture is a Thinc `Model`. Note that
+`MentionClusters` is `List[List[Tuple[int, int]]]`.
+
+| Name | Description |
+| ------------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
+| `hidden_size` | Size of the main internal layers. ~~int~~ |
+| `distance_embedding_size` | A representation of the distance between two candidates. ~~int~~ |
+| `conv_channels` | The number of channels in the internal CNN. ~~int~~ |
+| `window_size` | The number of neighboring tokens to consider in the internal CNN. `1` means consider one token on each side. ~~int~~ |
+| `max_distance` | The longest possible length of a predicted span. ~~int~~ |
+| `prefix` | The prefix that indicates spans to use for input data. ~~string~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[MentionClusters]]~~ |
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index e5cd3089b..fc2c46022 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -1482,7 +1482,7 @@ You'll also need to add the assets you want to track with
```cli
-$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
+$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
```
> #### Example
@@ -1499,6 +1499,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
+| `--quiet`, `-q` | Print no output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
diff --git a/website/docs/api/coref.md b/website/docs/api/coref.md
new file mode 100644
index 000000000..8f54422d6
--- /dev/null
+++ b/website/docs/api/coref.md
@@ -0,0 +1,353 @@
+---
+title: CoreferenceResolver
+tag: class,experimental
+source: spacy-experimental/coref/coref_component.py
+teaser: 'Pipeline component for word-level coreference resolution'
+api_base_class: /api/pipe
+api_string_name: coref
+api_trainable: true
+---
+
+> #### Installation
+>
+> ```bash
+> $ pip install -U spacy-experimental
+> ```
+
+
+
+This component is not yet integrated into spaCy core, and is available via the
+extension package
+[`spacy-experimental`](https://github.com/explosion/spacy-experimental) starting
+in version 0.6.0. It exposes the component via
+[entry points](/usage/saving-loading/#entry-points), so if you have the package
+installed, using `factory = "experimental_coref"` in your
+[training config](/usage/training#config) or
+`nlp.add_pipe("experimental_coref")` will work out-of-the-box.
+
+
+
+A `CoreferenceResolver` component groups tokens into clusters that refer to the
+same thing. Clusters are stored as `SpanGroup`s under `Doc.spans` keys that
+start with a given prefix (`coref_clusters` by default).
+
+A `CoreferenceResolver` component can be paired with a
+[`SpanResolver`](/api/span-resolver) to expand single tokens to spans.
+
+## Assigned Attributes {#assigned-attributes}
+
+Predictions will be saved to `Doc.spans` as a [`SpanGroup`](/api/spangroup). The
+span key will be a prefix plus a serial number referring to the coreference
+cluster, starting from one.
+
+The span key prefix defaults to `"coref_clusters"`, but can be passed as a
+parameter.
+
+| Location | Value |
+| ------------------------------------------ | ------------------------------------------------------------------------------------------------------- |
+| `Doc.spans[prefix + "_" + cluster_number]` | One coreference cluster, represented as single-token spans. Cluster numbers start from 1. ~~SpanGroup~~ |
+
+## Config and implementation {#config}
+
+The default config is defined by the pipeline component factory and describes
+how the component should be configured. You can override its settings via the
+`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
+[`config.cfg` for training](/usage/training#config). See the
+[model architectures](/api/architectures#coref-architectures) documentation for
+details on the architectures and their arguments and hyperparameters.
+
+> #### Example
+>
+> ```python
+> from spacy_experimental.coref.coref_component import DEFAULT_COREF_MODEL
+> from spacy_experimental.coref.coref_util import DEFAULT_CLUSTER_PREFIX
+> config = {
+>     "model": DEFAULT_COREF_MODEL,
+>     "span_cluster_prefix": DEFAULT_CLUSTER_PREFIX,
+> }
+> nlp.add_pipe("experimental_coref", config=config)
+> ```
+
+| Setting | Description |
+| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Coref](/api/architectures#Coref). ~~Model~~ |
+| `span_cluster_prefix` | The prefix for the keys for clusters saved to `doc.spans`. Defaults to `coref_clusters`. ~~str~~ |
+
+## CoreferenceResolver.\_\_init\_\_ {#init tag="method"}
+
+> #### Example
+>
+> ```python
+> # Construction via add_pipe with default model
+> coref = nlp.add_pipe("experimental_coref")
+>
+> # Construction via add_pipe with custom model
+> config = {"model": {"@architectures": "my_coref.v1"}}
+> coref = nlp.add_pipe("experimental_coref", config=config)
+>
+> # Construction from class
+> from spacy_experimental.coref.coref_component import CoreferenceResolver
+> coref = CoreferenceResolver(nlp.vocab, model)
+> ```
+
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.add_pipe`](/api/language#add_pipe).
+
+| Name | Description |
+| --------------------- | --------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `span_cluster_prefix` | The prefix for the keys under which clusters of spans are saved. ~~str~~ |
+
+## CoreferenceResolver.\_\_call\_\_ {#call tag="method"}
+
+Apply the pipe to one document. The document is modified in place and returned.
+This usually happens under the hood when the `nlp` object is called on a text
+and all pipeline components are applied to the `Doc` in order. Both
+[`__call__`](/api/coref#call) and [`pipe`](/api/coref#pipe) delegate to the
+[`predict`](/api/coref#predict) and
+[`set_annotations`](/api/coref#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> doc = nlp("This is a sentence.")
+> coref = nlp.add_pipe("experimental_coref")
+> # This usually happens under the hood
+> processed = coref(doc)
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------- |
+| `doc` | The document to process. ~~Doc~~ |
+| **RETURNS** | The processed document. ~~Doc~~ |
+
+## CoreferenceResolver.pipe {#pipe tag="method"}
+
+Apply the pipe to a stream of documents. This usually happens under the hood
+when the `nlp` object is called on a text and all pipeline components are
+applied to the `Doc` in order. Both [`__call__`](/api/coref#call) and
+[`pipe`](/api/coref#pipe) delegate to the [`predict`](/api/coref#predict) and
+[`set_annotations`](/api/coref#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> coref = nlp.add_pipe("experimental_coref")
+> for doc in coref.pipe(docs, batch_size=50):
+> pass
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------- |
+| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
+| _keyword-only_ | |
+| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
+| **YIELDS** | The processed documents in order. ~~Doc~~ |
+
+## CoreferenceResolver.initialize {#initialize tag="method"}
+
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. **At least one example
+should be supplied.** The data examples are used to **initialize the model** of
+the component and can either be the full training data or a representative
+sample. Initialization includes validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
+
+> #### Example
+>
+> ```python
+> coref = nlp.add_pipe("experimental_coref")
+> coref.initialize(lambda: examples, nlp=nlp)
+> ```
+
+| Name | Description |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ | |
+| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+
+## CoreferenceResolver.predict {#predict tag="method"}
+
+Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
+modifying them. Clusters are returned as a list of `MentionClusters`, one for
+each input `Doc`. A `MentionClusters` instance is just a list of lists of pairs
+of `int`s, where each item corresponds to a cluster, and the `int`s correspond
+to token indices.
+
+> #### Example
+>
+> ```python
+> coref = nlp.add_pipe("experimental_coref")
+> clusters = coref.predict([doc1, doc2])
+> ```
+
+| Name | Description |
+| ----------- | ---------------------------------------------------------------------------- |
+| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
+| **RETURNS** | The predicted coreference clusters for the `docs`. ~~List[MentionClusters]~~ |
+
+## CoreferenceResolver.set_annotations {#set_annotations tag="method"}
+
+Modify a batch of documents, saving coreference clusters in `Doc.spans`.
+
+> #### Example
+>
+> ```python
+> coref = nlp.add_pipe("experimental_coref")
+> clusters = coref.predict([doc1, doc2])
+> coref.set_annotations([doc1, doc2], clusters)
+> ```
+
+| Name | Description |
+| ---------- | ---------------------------------------------------------------------------- |
+| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
+| `clusters` | The predicted coreference clusters for the `docs`. ~~List[MentionClusters]~~ |
+
+## CoreferenceResolver.update {#update tag="method"}
+
+Learn from a batch of [`Example`](/api/example) objects. Delegates to
+[`predict`](/api/coref#predict).
+
+> #### Example
+>
+> ```python
+> coref = nlp.add_pipe("experimental_coref")
+> optimizer = nlp.initialize()
+> losses = coref.update(examples, sgd=optimizer)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
+| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop` | The dropout rate. ~~float~~ |
+| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+## CoreferenceResolver.create_optimizer {#create_optimizer tag="method"}
+
+Create an optimizer for the pipeline component.
+
+> #### Example
+>
+> ```python
+> coref = nlp.add_pipe("experimental_coref")
+> optimizer = coref.create_optimizer()
+> ```
+
+| Name | Description |
+| ----------- | ---------------------------- |
+| **RETURNS** | The optimizer. ~~Optimizer~~ |
+
+## CoreferenceResolver.use_params {#use_params tag="method, contextmanager"}
+
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
+
+> #### Example
+>
+> ```python
+> coref = nlp.add_pipe("experimental_coref")
+> with coref.use_params(optimizer.averages):
+> coref.to_disk("/best_model")
+> ```
+
+| Name | Description |
+| -------- | -------------------------------------------------- |
+| `params` | The parameter values to use in the model. ~~dict~~ |
+
+## CoreferenceResolver.to_disk {#to_disk tag="method"}
+
+Serialize the pipe to disk.
+
+> #### Example
+>
+> ```python
+> coref = nlp.add_pipe("experimental_coref")
+> coref.to_disk("/path/to/coref")
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+
+## CoreferenceResolver.from_disk {#from_disk tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> coref = nlp.add_pipe("experimental_coref")
+> coref.from_disk("/path/to/coref")
+> ```
+
+| Name | Description |
+| -------------- | ----------------------------------------------------------------------------------------------- |
+| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The modified `CoreferenceResolver` object. ~~CoreferenceResolver~~ |
+
+## CoreferenceResolver.to_bytes {#to_bytes tag="method"}
+
+> #### Example
+>
+> ```python
+> coref = nlp.add_pipe("experimental_coref")
+> coref_bytes = coref.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The serialized form of the `CoreferenceResolver` object. ~~bytes~~ |
+
+## CoreferenceResolver.from_bytes {#from_bytes tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> coref_bytes = coref.to_bytes()
+> coref = nlp.add_pipe("experimental_coref")
+> coref.from_bytes(coref_bytes)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| `bytes_data` | The data to load from. ~~bytes~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The `CoreferenceResolver` object. ~~CoreferenceResolver~~ |
+
+## Serialization fields {#serialization-fields}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> data = coref.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| ------- | -------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `cfg` | The config file. You usually don't want to exclude this. |
+| `model` | The binary model data. You usually don't want to exclude this. |
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index f97f4ad83..f97ed4547 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -31,21 +31,21 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ```
-| Name | Description |
-| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | A storage container for lexical types. ~~Vocab~~ |
-| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
-| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
-| _keyword-only_ | |
-| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
-| `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
-| `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
-| `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
-| `lemmas` 3 | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
-| `heads` 3 | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
-| `deps` 3 | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
-| `sent_starts` 3 | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~ |
-| `ents` 3 | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
+| Name | Description |
+| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | A storage container for lexical types. ~~Vocab~~ |
+| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ |
+| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
+| _keyword-only_ | |
+| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |
+| `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `lemmas` 3 | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `heads` 3 | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
+| `deps` 3 | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~ |
+| `sent_starts` 3 | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, int, None]]]~~ |
+| `ents` 3 | A list of strings, of the same length as `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~ |
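+
+For example, sentence boundaries can be supplied directly at construction time
+via `sent_starts`. A minimal sketch (annotation values are illustrative):
+
+```python
+from spacy.vocab import Vocab
+from spacy.tokens import Doc
+
+words = ["Hello", "world", ".", "Goodbye", "."]
+# True marks a sentence start, None leaves the decision to later components.
+sent_starts = [True, None, None, True, None]
+doc = Doc(Vocab(), words=words, sent_starts=sent_starts)
+assert doc[3].is_sent_start
+```
+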
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 43e08a39c..40ec8afb5 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -14,7 +14,8 @@ entities) to unique identifiers, grounding the named entities into the "real
world". It requires a `KnowledgeBase`, as well as a function to generate
plausible candidates from that `KnowledgeBase` given a certain textual mention,
and a machine learning model to pick the right candidate, given the local
-context of the mention.
+context of the mention. `EntityLinker` defaults to using the
+[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
## Assigned Attributes {#assigned-attributes}
@@ -170,7 +171,7 @@ with the current vocab.
>
> ```python
> def create_kb(vocab):
-> kb = KnowledgeBase(vocab, entity_vector_length=128)
+> kb = InMemoryLookupKB(vocab, entity_vector_length=128)
> kb.add_entity(...)
> kb.add_alias(...)
> return kb
diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md
index e7a8fcd6f..b217a1678 100644
--- a/website/docs/api/kb.md
+++ b/website/docs/api/kb.md
@@ -4,27 +4,45 @@ teaser:
A storage class for entities and aliases of a specific knowledge base
(ontology)
tag: class
-source: spacy/kb.pyx
+source: spacy/kb/kb.pyx
new: 2.2
---
-The `KnowledgeBase` object provides a method to generate
-[`Candidate`](/api/kb/#candidate) objects, which are plausible external
+The `KnowledgeBase` object is an abstract class providing a method to generate
+[`Candidate`](/api/kb#candidate) objects, which are plausible external
identifiers given a certain textual mention. Each such `Candidate` holds
information from the relevant KB entities, such as its frequency in text and
possible aliases. Each entity in the knowledge base also has a pretrained entity
vector of a fixed size.
+Beyond that, `KnowledgeBase` classes have to implement a number of utility
+functions called by the [`EntityLinker`](/api/entitylinker) component.
+
+
+
+This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
+implementation up to that point is available as `InMemoryLookupKB` from 3.5
+onwards.
+
+
+
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
-Create the knowledge base.
+`KnowledgeBase` is an abstract class and cannot be instantiated. Its child
+classes should call `__init__()` to set up some necessary attributes.
> #### Example
>
> ```python
> from spacy.kb import KnowledgeBase
+> from spacy.vocab import Vocab
+>
+> class FullyImplementedKB(KnowledgeBase):
+> def __init__(self, vocab: Vocab, entity_vector_length: int):
+> super().__init__(vocab, entity_vector_length)
+> ...
> vocab = nlp.vocab
-> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
+> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
> ```
| Name | Description |
@@ -40,133 +58,66 @@ The length of the fixed-size entity vectors in the knowledge base.
| ----------- | ------------------------------------------------ |
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
-## KnowledgeBase.add_entity {#add_entity tag="method"}
+## KnowledgeBase.get_candidates {#get_candidates tag="method"}
-Add an entity to the knowledge base, specifying its corpus frequency and entity
-vector, which should be of length
-[`entity_vector_length`](/api/kb#entity_vector_length).
+Given a certain textual mention as input, retrieve a list of candidate entities
+of type [`Candidate`](/api/kb#candidate).
> #### Example
>
> ```python
-> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
-> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
+> from spacy.lang.en import English
+> nlp = English()
+> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
+> candidates = kb.get_candidates(doc[0:2])
> ```
-| Name | Description |
-| --------------- | ---------------------------------------------------------- |
-| `entity` | The unique entity identifier. ~~str~~ |
-| `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
-| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
+| Name | Description |
+| ----------- | -------------------------------------------------------------------- |
+| `mention` | The textual mention or alias. ~~Span~~ |
+| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
-## KnowledgeBase.set_entities {#set_entities tag="method"}
+## KnowledgeBase.get_candidates_batch {#get_candidates_batch tag="method"}
-Define the full list of entities in the knowledge base, specifying the corpus
-frequency and entity vector for each entity.
+Same as [`get_candidates()`](/api/kb#get_candidates), but for an arbitrary
+number of mentions. The [`EntityLinker`](/api/entitylinker) component will call
+`get_candidates_batch()` instead of `get_candidates()` if the config parameter
+`candidates_batch_size` is greater than or equal to 1.
+
+The default implementation of `get_candidates_batch()` executes
+`get_candidates()` in a loop. If performance is a concern, we recommend
+implementing a more efficient way to retrieve candidates for multiple mentions
+at once.
> #### Example
>
> ```python
-> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
+> from spacy.lang.en import English
+> nlp = English()
+> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
+> candidates = kb.get_candidates_batch((doc[0:2], doc[3:]))
> ```
-| Name | Description |
-| ------------- | ---------------------------------------------------------------- |
-| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
-| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
-| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
-
-## KnowledgeBase.add_alias {#add_alias tag="method"}
-
-Add an alias or mention to the knowledge base, specifying its potential KB
-identifiers and their prior probabilities. The entity identifiers should refer
-to entities previously added with [`add_entity`](/api/kb#add_entity) or
-[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
-should not exceed 1. Note that an empty string can not be used as alias.
-
-> #### Example
->
-> ```python
-> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
-> ```
-
-| Name | Description |
-| --------------- | --------------------------------------------------------------------------------- |
-| `alias` | The textual mention or alias. Can not be the empty string. ~~str~~ |
-| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
-| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
-
-## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
-
-Get the total number of entities in the knowledge base.
-
-> #### Example
->
-> ```python
-> total_entities = len(kb)
-> ```
-
-| Name | Description |
-| ----------- | ----------------------------------------------------- |
-| **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
-
-## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"}
-
-Get a list of all entity IDs in the knowledge base.
-
-> #### Example
->
-> ```python
-> all_entities = kb.get_entity_strings()
-> ```
-
-| Name | Description |
-| ----------- | --------------------------------------------------------- |
-| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
-
-## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
-
-Get the total number of aliases in the knowledge base.
-
-> #### Example
->
-> ```python
-> total_aliases = kb.get_size_aliases()
-> ```
-
-| Name | Description |
-| ----------- | ---------------------------------------------------- |
-| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
-
-## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
-
-Get a list of all aliases in the knowledge base.
-
-> #### Example
->
-> ```python
-> all_aliases = kb.get_alias_strings()
-> ```
-
-| Name | Description |
-| ----------- | -------------------------------------------------------- |
-| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
+| Name | Description |
+| ----------- | -------------------------------------------------------------------------------------------- |
+| `mentions`  | The textual mentions or aliases. ~~Iterable[Span]~~                                          |
+| **RETURNS** | An iterable of iterables of relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~  |
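+
+As an illustration, a custom subclass might override `get_candidates_batch()`
+to prepare all mention texts up front so they can be resolved together. The
+sketch below builds on `InMemoryLookupKB` and, for simplicity, still falls back
+to a per-text lookup; a real implementation would replace the final list
+comprehension with one batched query against its backing store:
+
+```python
+from typing import Iterable
+
+from spacy.kb import Candidate, InMemoryLookupKB
+from spacy.tokens import Span
+
+
+class BatchedKB(InMemoryLookupKB):
+    def get_candidates_batch(
+        self, mentions: Iterable[Span]
+    ) -> Iterable[Iterable[Candidate]]:
+        # Collect the surface forms first, e.g. to send them to an external
+        # store in a single request.
+        texts = [mention.text for mention in mentions]
+        return [self.get_alias_candidates(text) for text in texts]
+```
+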
## KnowledgeBase.get_alias_candidates {#get_alias_candidates tag="method"}
-Given a certain textual mention as input, retrieve a list of candidate entities
-of type [`Candidate`](/api/kb/#candidate).
+
+This method is _not_ available from spaCy 3.5 onwards.
+
-> #### Example
->
-> ```python
-> candidates = kb.get_alias_candidates("Douglas")
-> ```
-
-| Name | Description |
-| ----------- | ------------------------------------------------------------- |
-| `alias` | The textual mention or alias. ~~str~~ |
-| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
+From spaCy 3.5 onwards, `KnowledgeBase` is an abstract class (with
+[`InMemoryLookupKB`](/api/kb_in_memory) available as a drop-in replacement) to
+allow more flexibility in customizing knowledge bases. Some of its methods were
+moved to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of
+them being `get_alias_candidates()`. This method is now available as
+[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+Note that [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
+wraps
+[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
## KnowledgeBase.get_vector {#get_vector tag="method"}
@@ -178,27 +129,30 @@ Given a certain entity ID, retrieve its pretrained entity vector.
> vector = kb.get_vector("Q42")
> ```
-| Name | Description |
-| ----------- | ------------------------------------ |
-| `entity` | The entity ID. ~~str~~ |
-| **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
+| Name | Description |
+| ----------- | -------------------------------------- |
+| `entity` | The entity ID. ~~str~~ |
+| **RETURNS** | The entity vector. ~~Iterable[float]~~ |
-## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
+## KnowledgeBase.get_vectors {#get_vectors tag="method"}
-Given a certain entity ID and a certain textual mention, retrieve the prior
-probability of the fact that the mention links to the entity ID.
+Same as [`get_vector()`](/api/kb#get_vector), but for an arbitrary number of
+entity IDs.
+
+The default implementation of `get_vectors()` executes `get_vector()` in a
+loop. If performance is a concern, we recommend implementing a more efficient
+way to retrieve vectors for multiple entities at once.
> #### Example
>
> ```python
-> probability = kb.get_prior_prob("Q42", "Douglas")
+> vectors = kb.get_vectors(("Q42", "Q3107329"))
> ```
-| Name | Description |
-| ----------- | ------------------------------------------------------------------------- |
-| `entity` | The entity ID. ~~str~~ |
-| `alias` | The textual mention or alias. ~~str~~ |
-| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
+| Name | Description |
+| ----------- | --------------------------------------------------------- |
+| `entities` | The entity IDs. ~~Iterable[str]~~ |
+| **RETURNS** | The entity vectors. ~~Iterable[Iterable[numpy.ndarray]]~~ |
## KnowledgeBase.to_disk {#to_disk tag="method"}
@@ -207,12 +161,13 @@ Save the current state of the knowledge base to a directory.
> #### Example
>
> ```python
-> kb.to_disk(loc)
+> kb.to_disk(path)
> ```
-| Name | Description |
-| ----- | ------------------------------------------------------------------------------------------------------------------------------------------ |
-| `loc` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| Name | Description |
+| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
## KnowledgeBase.from_disk {#from_disk tag="method"}
@@ -222,16 +177,16 @@ Restore the state of the knowledge base from a given directory. Note that the
> #### Example
>
> ```python
-> from spacy.kb import KnowledgeBase
> from spacy.vocab import Vocab
> vocab = Vocab().from_disk("/path/to/vocab")
-> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
+> kb = FullyImplementedKB(vocab=vocab, entity_vector_length=64)
> kb.from_disk("/path/to/kb")
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------- |
| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
## Candidate {#candidate tag="class"}
diff --git a/website/docs/api/kb_in_memory.md b/website/docs/api/kb_in_memory.md
new file mode 100644
index 000000000..9e3279e6a
--- /dev/null
+++ b/website/docs/api/kb_in_memory.md
@@ -0,0 +1,302 @@
+---
+title: InMemoryLookupKB
+teaser:
+ The default implementation of the KnowledgeBase interface. Stores all
+ information in-memory.
+tag: class
+source: spacy/kb/kb_in_memory.pyx
+new: 3.5
+---
+
+The `InMemoryLookupKB` class inherits from [`KnowledgeBase`](/api/kb) and
+implements all of its methods. It stores all KB data in-memory and generates
+[`Candidate`](/api/kb#candidate) objects by exactly matching mentions with
+entity names. It's highly optimized for both a low memory footprint and speed of
+retrieval.
+
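+A small KB can be assembled and queried with the methods documented below. A
+minimal sketch (entity IDs, frequencies and vectors are illustrative):
+
+```python
+import numpy
+from spacy.lang.en import English
+from spacy.kb import InMemoryLookupKB
+
+nlp = English()
+kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
+kb.add_entity(entity="Q42", freq=32, entity_vector=numpy.asarray([1.0, 2.0, 3.0]))
+kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
+candidates = kb.get_alias_candidates("Douglas Adams")
+```
+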
+## InMemoryLookupKB.\_\_init\_\_ {#init tag="method"}
+
+Create the knowledge base.
+
+> #### Example
+>
+> ```python
+> from spacy.kb import InMemoryLookupKB
+> vocab = nlp.vocab
+> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
+> ```
+
+| Name | Description |
+| ---------------------- | ------------------------------------------------ |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `entity_vector_length` | Length of the fixed-size entity vectors. ~~int~~ |
+
+## InMemoryLookupKB.entity_vector_length {#entity_vector_length tag="property"}
+
+The length of the fixed-size entity vectors in the knowledge base.
+
+| Name | Description |
+| ----------- | ------------------------------------------------ |
+| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
+
+## InMemoryLookupKB.add_entity {#add_entity tag="method"}
+
+Add an entity to the knowledge base, specifying its corpus frequency and entity
+vector, which should be of length
+[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
+
+> #### Example
+>
+> ```python
+> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
+> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
+> ```
+
+| Name | Description |
+| --------------- | ---------------------------------------------------------- |
+| `entity` | The unique entity identifier. ~~str~~ |
+| `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
+| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
+
+## InMemoryLookupKB.set_entities {#set_entities tag="method"}
+
+Define the full list of entities in the knowledge base, specifying the corpus
+frequency and entity vector for each entity.
+
+> #### Example
+>
+> ```python
+> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
+> ```
+
+| Name | Description |
+| ------------- | ---------------------------------------------------------------- |
+| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
+| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
+| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
+
+## InMemoryLookupKB.add_alias {#add_alias tag="method"}
+
+Add an alias or mention to the knowledge base, specifying its potential KB
+identifiers and their prior probabilities. The entity identifiers should refer
+to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
+or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
+probabilities should not exceed 1. Note that an empty string cannot be used as
+an alias.
+
+> #### Example
+>
+> ```python
+> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
+> ```
+
+| Name | Description |
+| --------------- | --------------------------------------------------------------------------------- |
+| `alias`         | The textual mention or alias. Cannot be the empty string. ~~str~~                  |
+| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
+| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
+
+## InMemoryLookupKB.\_\_len\_\_ {#len tag="method"}
+
+Get the total number of entities in the knowledge base.
+
+> #### Example
+>
+> ```python
+> total_entities = len(kb)
+> ```
+
+| Name | Description |
+| ----------- | ----------------------------------------------------- |
+| **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
+
+## InMemoryLookupKB.get_entity_strings {#get_entity_strings tag="method"}
+
+Get a list of all entity IDs in the knowledge base.
+
+> #### Example
+>
+> ```python
+> all_entities = kb.get_entity_strings()
+> ```
+
+| Name | Description |
+| ----------- | --------------------------------------------------------- |
+| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
+
+## InMemoryLookupKB.get_size_aliases {#get_size_aliases tag="method"}
+
+Get the total number of aliases in the knowledge base.
+
+> #### Example
+>
+> ```python
+> total_aliases = kb.get_size_aliases()
+> ```
+
+| Name | Description |
+| ----------- | ---------------------------------------------------- |
+| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
+
+## InMemoryLookupKB.get_alias_strings {#get_alias_strings tag="method"}
+
+Get a list of all aliases in the knowledge base.
+
+> #### Example
+>
+> ```python
+> all_aliases = kb.get_alias_strings()
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------------------------------- |
+| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
+
+## InMemoryLookupKB.get_candidates {#get_candidates tag="method"}
+
+Given a certain textual mention as input, retrieve a list of candidate entities
+of type [`Candidate`](/api/kb#candidate). Wraps
+[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+
+> #### Example
+>
+> ```python
+> from spacy.lang.en import English
+> nlp = English()
+> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
+> candidates = kb.get_candidates(doc[0:2])
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------------------------------------------- |
+| `mention` | The textual mention or alias. ~~Span~~ |
+| **RETURNS** | An iterable of relevant `Candidate` objects. ~~Iterable[Candidate]~~ |
+
+## InMemoryLookupKB.get_candidates_batch {#get_candidates_batch tag="method"}
+
+Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
+arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
+will call `get_candidates_batch()` instead of `get_candidates()` if the config
+parameter `candidates_batch_size` is greater than or equal to 1.
+
+The default implementation of `get_candidates_batch()` executes
+`get_candidates()` in a loop. If performance is a concern, we recommend
+implementing a more efficient way to retrieve candidates for multiple mentions
+at once.
+
+> #### Example
+>
+> ```python
+> from spacy.lang.en import English
+> nlp = English()
+> doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
+> candidates = kb.get_candidates_batch((doc[0:2], doc[3:]))
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------------------------------------------------------------------- |
+| `mentions`  | The textual mentions or aliases. ~~Iterable[Span]~~                                          |
+| **RETURNS** | An iterable of iterables of relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~  |
+
+## InMemoryLookupKB.get_alias_candidates {#get_alias_candidates tag="method"}
+
+Given a certain textual mention as input, retrieve a list of candidate entities
+of type [`Candidate`](/api/kb#candidate).
+
+> #### Example
+>
+> ```python
+> candidates = kb.get_alias_candidates("Douglas")
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------- |
+| `alias` | The textual mention or alias. ~~str~~ |
+| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
+
+## InMemoryLookupKB.get_vector {#get_vector tag="method"}
+
+Given a certain entity ID, retrieve its pretrained entity vector.
+
+> #### Example
+>
+> ```python
+> vector = kb.get_vector("Q42")
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------ |
+| `entity` | The entity ID. ~~str~~ |
+| **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
+
+## InMemoryLookupKB.get_vectors {#get_vectors tag="method"}
+
+Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
+number of entity IDs.
+
+The default implementation of `get_vectors()` executes `get_vector()` in a
+loop. If performance is a concern, we recommend implementing a more efficient
+way to retrieve vectors for multiple entities at once.
+
+> #### Example
+>
+> ```python
+> vectors = kb.get_vectors(("Q42", "Q3107329"))
+> ```
+
+| Name | Description |
+| ----------- | --------------------------------------------------------- |
+| `entities` | The entity IDs. ~~Iterable[str]~~ |
+| **RETURNS** | The entity vectors. ~~Iterable[Iterable[numpy.ndarray]]~~ |
+
+## InMemoryLookupKB.get_prior_prob {#get_prior_prob tag="method"}
+
+Given a certain entity ID and a certain textual mention, retrieve the prior
+probability of the fact that the mention links to the entity ID.
+
+> #### Example
+>
+> ```python
+> probability = kb.get_prior_prob("Q42", "Douglas")
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------------------- |
+| `entity` | The entity ID. ~~str~~ |
+| `alias` | The textual mention or alias. ~~str~~ |
+| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
+
+## InMemoryLookupKB.to_disk {#to_disk tag="method"}
+
+Save the current state of the knowledge base to a directory.
+
+> #### Example
+>
+> ```python
+> kb.to_disk(path)
+> ```
+
+| Name | Description |
+| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
+
+## InMemoryLookupKB.from_disk {#from_disk tag="method"}
+
+Restore the state of the knowledge base from a given directory. Note that the
+[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.
+
+> #### Example
+>
+> ```python
+> from spacy.kb import InMemoryLookupKB
+> from spacy.vocab import Vocab
+> vocab = Vocab().from_disk("/path/to/vocab")
+> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
+> kb.from_disk("/path/to/kb")
+> ```
+
+| Name | Description |
+| ----------- | ----------------------------------------------------------------------------------------------- |
+| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| `exclude` | List of components to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index ed763e36a..767a7450a 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -164,6 +164,9 @@ examples, see the
Apply the pipeline to some text. The text can span multiple sentences, and can
contain arbitrary whitespace. Alignment into the original string is preserved.
+Instead of text, a `Doc` can be passed as input, in which case tokenization is
+skipped, but the rest of the pipeline is run.
+
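+For example, a `Doc` created elsewhere keeps its tokens and only has the
+pipeline components applied to it. A minimal sketch:
+
+```python
+from spacy.tokens import Doc
+
+doc = Doc(nlp.vocab, words=["Hello", "world", "!"])
+doc = nlp(doc)  # tokenization is skipped, the components still run
+```
+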
> #### Example
>
> ```python
@@ -173,7 +176,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
| Name | Description |
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `text` | The text to be processed. ~~str~~ |
+| `text`          | The text to be processed, or a `Doc`. ~~Union[str, Doc]~~                                                                                       |
| _keyword-only_ | |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
@@ -184,6 +187,9 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
Process texts as a stream, and yield `Doc` objects in order. This is usually
more efficient than processing texts one-by-one.
+Instead of text, a `Doc` object can be passed as input, in which case
+tokenization is skipped, but the rest of the pipeline is run.
+
> #### Example
>
> ```python
@@ -194,7 +200,7 @@ more efficient than processing texts one-by-one.
| Name | Description |
| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `texts` | A sequence of strings. ~~Iterable[str]~~ |
+| `texts` | A sequence of strings (or `Doc` objects). ~~Iterable[Union[str, Doc]]~~ |
| _keyword-only_ | |
| `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
| `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index 422f34040..905096338 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -70,7 +70,7 @@ lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require
[`token.pos`](/api/token) from a previous pipeline component (see example
pipeline configurations in the
[pretrained pipeline design details](/models#design-cnn)) or rely on third-party
-libraries (`pymorphy2`).
+libraries (`pymorphy3`).
| Language | Default Mode |
| -------- | ------------ |
@@ -86,9 +86,9 @@ libraries (`pymorphy2`).
| `nb` | `rule` |
| `nl` | `rule` |
| `pl` | `pos_lookup` |
-| `ru` | `pymorphy2` |
+| `ru` | `pymorphy3` |
| `sv` | `rule` |
-| `uk` | `pymorphy2` |
+| `uk` | `pymorphy3` |
```python
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md
index 1b7017ca7..070292782 100644
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@@ -153,3 +153,36 @@ whole pipeline has run.
| `attrs` | A dict of the `Doc` attributes and the values to set them to. Defaults to `{"tensor": None, "_.trf_data": None}` to clean up after `tok2vec` and `transformer` components. ~~dict~~ |
| `silent` | If `False`, show warnings if attributes aren't found or can't be set. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The modified `Doc` with the modified attributes. ~~Doc~~ |
+
+## span_cleaner {#span_cleaner tag="function,experimental"}
+
+Remove `SpanGroup`s from `doc.spans` based on a key prefix. This is used to
+clean up after the [`CoreferenceResolver`](/api/coref) when it's paired with a
+[`SpanResolver`](/api/span-resolver).
+
+
+
+This pipeline function is not yet integrated into spaCy core, and is available
+via the extension package
+[`spacy-experimental`](https://github.com/explosion/spacy-experimental) starting
+in version 0.6.0. It exposes the component via
+[entry points](/usage/saving-loading/#entry-points), so if you have the package
+installed, using `factory = "span_cleaner"` in your
+[training config](/usage/training#config) or `nlp.add_pipe("span_cleaner")` will
+work out-of-the-box.
+
+
+
+> #### Example
+>
+> ```python
+> config = {"prefix": "coref_head_clusters"}
+> nlp.add_pipe("span_cleaner", config=config)
+> doc = nlp("text")
+> assert "coref_head_clusters_1" not in doc.spans
+> ```
+
+| Setting | Description |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `prefix` | A prefix to check `SpanGroup` keys for. Any matching groups will be removed. Defaults to `"coref_head_clusters"`. ~~str~~ |
+| **RETURNS** | The modified `Doc` with any matching spans removed. ~~Doc~~ |
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index 8dbe3b276..ca3462aa9 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -270,3 +270,62 @@ Compute micro-PRF and per-entity PRF scores.
| Name | Description |
| ---------- | ------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+
+## score_coref_clusters {#score_coref_clusters tag="experimental"}
+
+Returns LEA ([Moosavi and Strube, 2016](https://aclanthology.org/P16-1060/)) PRF
+scores for coreference clusters.
+
+
+
+Note that this scoring function is not yet included in spaCy core. For details,
+see the [CoreferenceResolver](/api/coref) docs.
+
+
+
+> #### Example
+>
+> ```python
+> scores = score_coref_clusters(
+> examples,
+> span_cluster_prefix="coref_clusters",
+> )
+> print(scores["coref_f"])
+> ```
+
+| Name | Description |
+| --------------------- | ------------------------------------------------------------------------------------------------------------------- |
+| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `span_cluster_prefix` | The prefix used for spans representing coreference clusters. ~~str~~ |
+| **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ |
+
+## score_span_predictions {#score_span_predictions tag="experimental"}
+
+Return accuracy for reconstructions of spans from single tokens. Only exactly
+correct predictions are counted as correct; there is no partial credit for near
+misses. Used by the [SpanResolver](/api/span-resolver).
+
+
+
+Note that this scoring function is not yet included in spaCy core. For details,
+see the [SpanResolver](/api/span-resolver) docs.
+
+
+
+> #### Example
+>
+> ```python
+> scores = score_span_predictions(
+> examples,
+> output_prefix="coref_clusters",
+> )
+> print(scores["span_coref_clusters_accuracy"])
+> ```
+
+| Name | Description |
+| --------------- | ------------------------------------------------------------------------------------------------------------------- |
+| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `output_prefix` | The prefix used for spans representing the final predicted spans. ~~str~~ |
+| **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ |
diff --git a/website/docs/api/span-resolver.md b/website/docs/api/span-resolver.md
new file mode 100644
index 000000000..3e992cd03
--- /dev/null
+++ b/website/docs/api/span-resolver.md
@@ -0,0 +1,356 @@
+---
+title: SpanResolver
+tag: class,experimental
+source: spacy-experimental/coref/span_resolver_component.py
+teaser: 'Pipeline component for resolving tokens into spans'
+api_base_class: /api/pipe
+api_string_name: span_resolver
+api_trainable: true
+---
+
+> #### Installation
+>
+> ```bash
+> $ pip install -U spacy-experimental
+> ```
+
+
+
+This component is not yet integrated into spaCy core, and is available via the
+extension package
+[`spacy-experimental`](https://github.com/explosion/spacy-experimental) starting
+in version 0.6.0. It exposes the component via
+[entry points](/usage/saving-loading/#entry-points), so if you have the package
+installed, using `factory = "experimental_span_resolver"` in your
+[training config](/usage/training#config) or
+`nlp.add_pipe("experimental_span_resolver")` will work out-of-the-box.
+
+
+
+A `SpanResolver` component takes in tokens (represented as `Span` objects of
+length 1) and resolves them into `Span` objects of arbitrary length. The initial
+use case is as a post-processing step on word-level
+[coreference resolution](/api/coref). The input and output keys used to store
+`Span` objects are configurable.
+
+## Assigned Attributes {#assigned-attributes}
+
+Predictions will be saved to `Doc.spans` as [`SpanGroup`s](/api/spangroup).
+
+Input token spans will be read in using an input prefix, by default
+`"coref_head_clusters"`, and output spans will be saved using an output prefix
+(default `"coref_clusters"`) plus a serial number starting from one. The
+prefixes are configurable.
+
+| Location | Value |
+| ------------------------------------------------- | ------------------------------------------------------------------------- |
+| `Doc.spans[output_prefix + "_" + cluster_number]` | One group of predicted spans. Cluster number starts from 1. ~~SpanGroup~~ |
+
+## Config and implementation {#config}
+
+The default config is defined by the pipeline component factory and describes
+how the component should be configured. You can override its settings via the
+`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
+[`config.cfg` for training](/usage/training#config). See the
+[model architectures](/api/architectures#coref-architectures) documentation for
+details on the architectures and their arguments and hyperparameters.
+
+> #### Example
+>
+> ```python
+> from spacy_experimental.coref.span_resolver_component import DEFAULT_SPAN_RESOLVER_MODEL
+> from spacy_experimental.coref.coref_util import DEFAULT_CLUSTER_PREFIX, DEFAULT_CLUSTER_HEAD_PREFIX
+> config = {
+>     "model": DEFAULT_SPAN_RESOLVER_MODEL,
+>     "input_prefix": DEFAULT_CLUSTER_HEAD_PREFIX,
+>     "output_prefix": DEFAULT_CLUSTER_PREFIX,
+> }
+> nlp.add_pipe("experimental_span_resolver", config=config)
+> ```
+
+| Setting | Description |
+| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [SpanResolver](/api/architectures#SpanResolver). ~~Model~~ |
+| `input_prefix` | The prefix to use for input `SpanGroup`s. Defaults to `coref_head_clusters`. ~~str~~ |
+| `output_prefix` | The prefix for predicted `SpanGroup`s. Defaults to `coref_clusters`. ~~str~~ |
+
+## SpanResolver.\_\_init\_\_ {#init tag="method"}
+
+> #### Example
+>
+> ```python
+> # Construction via add_pipe with default model
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+>
+> # Construction via add_pipe with custom model
+> config = {"model": {"@architectures": "my_span_resolver.v1"}}
+> span_resolver = nlp.add_pipe("experimental_span_resolver", config=config)
+>
+> # Construction from class
+> from spacy_experimental.coref.span_resolver_component import SpanResolver
+> span_resolver = SpanResolver(nlp.vocab, model)
+> ```
+
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.add_pipe`](/api/language#add_pipe).
+
+| Name | Description |
+| --------------- | --------------------------------------------------------------------------------------------------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `input_prefix` | The prefix to use for input `SpanGroup`s. Defaults to `coref_head_clusters`. ~~str~~ |
+| `output_prefix` | The prefix for predicted `SpanGroup`s. Defaults to `coref_clusters`. ~~str~~ |
+
+## SpanResolver.\_\_call\_\_ {#call tag="method"}
+
+Apply the pipe to one document. The document is modified in place and returned.
+This usually happens under the hood when the `nlp` object is called on a text
+and all pipeline components are applied to the `Doc` in order. Both
+[`__call__`](#call) and [`pipe`](#pipe) delegate to the [`predict`](#predict)
+and [`set_annotations`](#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> doc = nlp("This is a sentence.")
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> # This usually happens under the hood
+> processed = span_resolver(doc)
+> ```
+
+| Name | Description |
+| ----------- | -------------------------------- |
+| `doc` | The document to process. ~~Doc~~ |
+| **RETURNS** | The processed document. ~~Doc~~ |
+
+## SpanResolver.pipe {#pipe tag="method"}
+
+Apply the pipe to a stream of documents. This usually happens under the hood
+when the `nlp` object is called on a text and all pipeline components are
+applied to the `Doc` in order. Both [`__call__`](/api/span-resolver#call) and
+[`pipe`](/api/span-resolver#pipe) delegate to the
+[`predict`](/api/span-resolver#predict) and
+[`set_annotations`](/api/span-resolver#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> for doc in span_resolver.pipe(docs, batch_size=50):
+> pass
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------- |
+| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
+| _keyword-only_ | |
+| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
+| **YIELDS** | The processed documents in order. ~~Doc~~ |
+
+## SpanResolver.initialize {#initialize tag="method"}
+
+Initialize the component for training. `get_examples` should be a function that
+returns an iterable of [`Example`](/api/example) objects. **At least one example
+should be supplied.** The data examples are used to **initialize the model** of
+the component and can either be the full training data or a representative
+sample. Initialization includes validating the network,
+[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
+setting up the label scheme based on the data. This method is typically called
+by [`Language.initialize`](/api/language#initialize).
+
+> #### Example
+>
+> ```python
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> span_resolver.initialize(lambda: examples, nlp=nlp)
+> ```
+
+| Name | Description |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Must contain at least one `Example`. ~~Callable[[], Iterable[Example]]~~ |
+| _keyword-only_ | |
+| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+
+## SpanResolver.predict {#predict tag="method"}
+
+Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
+modifying them. Predictions are returned as a list of `MentionClusters`, one for
+each input `Doc`. A `MentionClusters` instance is just a list of lists of pairs
+of `int`s, where each item corresponds to an input `SpanGroup`, and the `int`s
+correspond to token indices.
+
+> #### Example
+>
+> ```python
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> spans = span_resolver.predict([doc1, doc2])
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------- |
+| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
+| **RETURNS** | The predicted spans for the `Doc`s. ~~List[MentionClusters]~~ |
+
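+For illustration, the returned structure for a single `Doc` might look like the
+following (token indices are hypothetical):
+
+```python
+# One inner list per input SpanGroup; each pair holds the token indices that
+# delimit a resolved span.
+mention_clusters = [
+    [(0, 2), (5, 6)],  # spans resolved for the first input group
+    [(9, 11)],         # spans resolved for the second input group
+]
+```
+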
+## SpanResolver.set_annotations {#set_annotations tag="method"}
+
+Modify a batch of documents, saving predictions using the output prefix in
+`Doc.spans`.
+
+> #### Example
+>
+> ```python
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> spans = span_resolver.predict([doc1, doc2])
+> span_resolver.set_annotations([doc1, doc2], spans)
+> ```
+
+| Name | Description |
+| ------- | ------------------------------------------------------------- |
+| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
+| `spans` | The predicted spans for the `docs`. ~~List[MentionClusters]~~ |
+
+## SpanResolver.update {#update tag="method"}
+
+Learn from a batch of [`Example`](/api/example) objects. Delegates to
+[`predict`](/api/span-resolver#predict).
+
+> #### Example
+>
+> ```python
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> optimizer = nlp.initialize()
+> losses = span_resolver.update(examples, sgd=optimizer)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
+| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop` | The dropout rate. ~~float~~ |
+| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+## SpanResolver.create_optimizer {#create_optimizer tag="method"}
+
+Create an optimizer for the pipeline component.
+
+> #### Example
+>
+> ```python
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> optimizer = span_resolver.create_optimizer()
+> ```
+
+| Name | Description |
+| ----------- | ---------------------------- |
+| **RETURNS** | The optimizer. ~~Optimizer~~ |
+
+## SpanResolver.use_params {#use_params tag="method, contextmanager"}
+
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
+
+> #### Example
+>
+> ```python
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> with span_resolver.use_params(optimizer.averages):
+> span_resolver.to_disk("/best_model")
+> ```
+
+| Name | Description |
+| -------- | -------------------------------------------------- |
+| `params` | The parameter values to use in the model. ~~dict~~ |
+
+## SpanResolver.to_disk {#to_disk tag="method"}
+
+Serialize the pipe to disk.
+
+> #### Example
+>
+> ```python
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> span_resolver.to_disk("/path/to/span_resolver")
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+
+## SpanResolver.from_disk {#from_disk tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> span_resolver.from_disk("/path/to/span_resolver")
+> ```
+
+| Name | Description |
+| -------------- | ----------------------------------------------------------------------------------------------- |
+| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The modified `SpanResolver` object. ~~SpanResolver~~ |
+
+## SpanResolver.to_bytes {#to_bytes tag="method"}
+
+> #### Example
+>
+> ```python
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> span_resolver_bytes = span_resolver.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The serialized form of the `SpanResolver` object. ~~bytes~~ |
+
+## SpanResolver.from_bytes {#from_bytes tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> span_resolver_bytes = span_resolver.to_bytes()
+> span_resolver = nlp.add_pipe("experimental_span_resolver")
+> span_resolver.from_bytes(span_resolver_bytes)
+> ```
+
+| Name | Description |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| `bytes_data` | The data to load from. ~~bytes~~ |
+| _keyword-only_ | |
+| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
+| **RETURNS** | The `SpanResolver` object. ~~SpanResolver~~ |
+
+## Serialization fields {#serialization-fields}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> data = span_resolver.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| ------- | -------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `cfg` | The config file. You usually don't want to exclude this. |
+| `model` | The binary model data. You usually don't want to exclude this. |
diff --git a/website/docs/api/spangroup.md b/website/docs/api/spangroup.md
index 8dbdefc01..2d1cf73c4 100644
--- a/website/docs/api/spangroup.md
+++ b/website/docs/api/spangroup.md
@@ -255,9 +255,10 @@ Return a copy of the span group.
> new_group = doc.spans["errors"].copy()
> ```
-| Name | Description |
-| ----------- | ----------------------------------------------- |
-| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
+| Name | Description |
+| ----------- | -------------------------------------------------------------------------------------------------- |
+| `doc` | The document to which the copy is bound. Defaults to `None` for the current doc. ~~Optional[Doc]~~ |
+| **RETURNS** | A copy of the `SpanGroup` object. ~~SpanGroup~~ |
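+
+For instance, a group can be copied onto another `Doc` with the same tokens,
+such as one created with `Doc.copy` (a minimal sketch):
+
+```python
+doc_copy = doc.copy()
+new_group = doc.spans["errors"].copy(doc=doc_copy)
+```
+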
## SpanGroup.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md
index 22e2b961e..4ebca2756 100644
--- a/website/docs/usage/101/_architecture.md
+++ b/website/docs/usage/101/_architecture.md
@@ -78,7 +78,9 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
| Name | Description |
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
-| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
+| [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
+| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. |
+| [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
| [`Morphology`](/api/morphology) | Store morphological analyses and map them to and from hash values. |
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 35150035a..90b612358 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -148,6 +148,13 @@ skipped. You can also set `--force` to force re-running a command, or `--dry` to
perform a "dry run" and see what would happen (without actually running the
script).
+Since spaCy v3.4.2, `spacy project run` checks your installed dependencies to
+verify that your environment is properly set up and aligns with the project's
+`requirements.txt`, if there is one. If missing or conflicting dependencies are
+detected, a corresponding warning is displayed. If you'd like to disable the
+dependency check, set `check_requirements: false` in your project's
+`project.yml`.
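+
+For example, the check can be switched off with a single top-level setting in
+your `project.yml`:
+
+```yaml
+check_requirements: false
+```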
+
### 4. Run a workflow {#run-workfow}
> #### project.yml
@@ -226,26 +233,49 @@ pipelines.
```yaml
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
```
+
> #### Tip: Overriding variables on the CLI
>
-> If you want to override one or more variables on the CLI and are not already specifying a
-> project directory, you need to add `.` as a placeholder:
+> If you want to override one or more variables on the CLI and are not already
+> specifying a project directory, you need to add `.` as a placeholder:
>
> ```
> python -m spacy project run test . --vars.foo bar
> ```
-| Section | Description |
-| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
-| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
-| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
-| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
-| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
-| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
-| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
-| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
-| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If it's loaded with an incompatible version, an error is raised when the project is loaded. |
+> #### Tip: Environment Variables
+>
+> Commands in a project file are not executed in a shell, so they don't have
+> direct access to environment variables. But you can insert environment
+> variables using the `env` dictionary to make values available for
+> interpolation, just like values in `vars`. Here's an example `env` dict that
+> makes `$PATH` available as `ENV_PATH`:
+>
+> ```yaml
+> env:
+> ENV_PATH: PATH
+> ```
+>
+> This can be used in a project command like so:
+>
+> ```yaml
+> - name: "echo-path"
+> script:
+> - "echo ${env.ENV_PATH}"
+> ```
+
+| Section | Description |
+| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `title`                                             | An optional project title used in the `--help` message and [auto-generated docs](#custom-docs). |
+| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
+| `vars`                                              | A dictionary of variables that can be referenced in paths, URLs and scripts, and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
+| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
+| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
+| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
+| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
+| `commands`                                          | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the files the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
+| `spacy_version`                                     | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. An error is raised if the project is loaded with an incompatible spaCy version. |
+| `check_requirements` <Tag variant="new">3.4.2</Tag> | A flag determining whether to verify that the installed dependencies align with the project's `requirements.txt`. Defaults to `true`. |
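
For orientation, here is a small, hypothetical `project.yml` touching each section from the table above (all names, paths and versions are made up), loaded with PyYAML only to show the resulting structure:

```python
# Sketch: a minimal project.yml covering the sections documented above.
import yaml

PROJECT_YML = """
title: "Demo pipeline"
description: "Train a small demo tagger."
vars:
  name: "demo"
  lang: "en"
env:
  ENV_PATH: PATH
directories: ["assets", "configs", "training", "metrics"]
workflows:
  all: ["train"]
commands:
  - name: "train"
    help: "Train the pipeline"
    script:
      - "python -m spacy train configs/config.cfg --output training/"
    deps: ["configs/config.cfg"]
    outputs: ["training/model-best"]
spacy_version: ">=3.4.0,<4.0.0"
check_requirements: true
"""

config = yaml.safe_load(PROJECT_YML)
print(sorted(config))              # section names from the table above
print(config["workflows"]["all"])  # ['train']
```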
### Data assets {#data-assets}
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 5e064b269..27a8bbca7 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -480,7 +480,7 @@ as-is. They are also excluded when calling
> parse. So the evaluation results should always reflect what your pipeline will
> produce at runtime. If you want a frozen component to run (without updating)
> during training as well, so that downstream components can use its
-> **predictions**, you can add it to the list of
+> **predictions**, you should add it to the list of
> [`annotating_components`](/usage/training#annotating-components).
```ini
diff --git a/website/docs/usage/v3-4.md b/website/docs/usage/v3-4.md
index 7cc4570d5..597fc3cc8 100644
--- a/website/docs/usage/v3-4.md
+++ b/website/docs/usage/v3-4.md
@@ -65,10 +65,10 @@ The English CNN pipelines have new word vectors:
| Package | Model Version | TAG | Parser LAS | NER F |
| ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
-| [`en_core_news_md`](/models/en#en_core_news_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
-| [`en_core_news_md`](/models/en#en_core_news_lg) | v3.4.0 | 97.2 | 90.3 | 85.5 |
-| [`en_core_news_lg`](/models/en#en_core_news_md) | v3.3.0 | 97.4 | 90.1 | 85.3 |
-| [`en_core_news_lg`](/models/en#en_core_news_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |
+| [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
+| [`en_core_web_md`](/models/en#en_core_web_md)   | v3.4.0        | 97.2 |       90.3 |  85.5 |
+| [`en_core_web_lg`](/models/en#en_core_web_lg)   | v3.3.0        | 97.4 |       90.1 |  85.3 |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |
## Notes about upgrading from v3.3 {#upgrading}
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 79e1fc5d5..bd1535c90 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -4,12 +4,22 @@
"code": "af",
"name": "Afrikaans"
},
+ {
+ "code": "am",
+ "name": "Amharic",
+ "has_examples": true
+ },
{
"code": "ar",
"name": "Arabic",
"example": "هذه جملة",
"has_examples": true
},
+ {
+ "code": "az",
+ "name": "Azerbaijani",
+ "has_examples": true
+ },
{
"code": "bg",
"name": "Bulgarian",
@@ -65,7 +75,7 @@
{
"code": "dsb",
"name": "Lower Sorbian",
- "has_examples": true
+ "has_examples": true
},
{
"code": "el",
@@ -142,6 +152,11 @@
"code": "ga",
"name": "Irish"
},
+ {
+ "code": "grc",
+ "name": "Ancient Greek",
+ "has_examples": true
+ },
{
"code": "gu",
"name": "Gujarati",
@@ -172,7 +187,7 @@
{
"code": "hsb",
"name": "Upper Sorbian",
- "has_examples": true
+ "has_examples": true
},
{
"code": "hu",
@@ -260,6 +275,10 @@
"example": "Адамга эң кыйыны — күн сайын адам болуу",
"has_examples": true
},
+ {
+ "code": "la",
+ "name": "Latin"
+ },
{
"code": "lb",
"name": "Luxembourgish",
@@ -374,8 +393,8 @@
"has_examples": true,
"dependencies": [
{
- "name": "pymorphy2",
- "url": "https://github.com/kmike/pymorphy2"
+ "name": "pymorphy3",
+ "url": "https://github.com/no-plagiarism/pymorphy3"
}
],
"models": [
@@ -448,6 +467,11 @@
"example": "นี่คือประโยค",
"has_examples": true
},
+ {
+ "code": "ti",
+ "name": "Tigrinya",
+ "has_examples": true
+ },
{
"code": "tl",
"name": "Tagalog"
@@ -480,12 +504,12 @@
],
"dependencies": [
{
- "name": "pymorphy2",
- "url": "https://github.com/kmike/pymorphy2"
+ "name": "pymorphy3",
+ "url": "https://github.com/no-plagiarism/pymorphy3"
},
{
- "name": "pymorphy2-dicts-uk",
- "url": "https://github.com/kmike/pymorphy2-dicts/"
+ "name": "pymorphy3-dicts-uk",
+ "url": "https://github.com/no-plagiarism/pymorphy3-dicts"
}
]
},
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 1b743636c..2d8745d77 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -12,7 +12,6 @@
{ "text": "New in v3.0", "url": "/usage/v3" },
{ "text": "New in v3.1", "url": "/usage/v3-1" },
{ "text": "New in v3.2", "url": "/usage/v3-2" },
- { "text": "New in v3.2", "url": "/usage/v3-2" },
{ "text": "New in v3.3", "url": "/usage/v3-3" },
{ "text": "New in v3.4", "url": "/usage/v3-4" }
]
@@ -95,6 +94,7 @@
"label": "Pipeline",
"items": [
{ "text": "AttributeRuler", "url": "/api/attributeruler" },
+ { "text": "CoreferenceResolver", "url": "/api/coref" },
{ "text": "DependencyParser", "url": "/api/dependencyparser" },
{ "text": "EditTreeLemmatizer", "url": "/api/edittreelemmatizer" },
{ "text": "EntityLinker", "url": "/api/entitylinker" },
@@ -105,6 +105,7 @@
{ "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" },
{ "text": "Sentencizer", "url": "/api/sentencizer" },
{ "text": "SpanCategorizer", "url": "/api/spancategorizer" },
+ { "text": "SpanResolver", "url": "/api/span-resolver" },
{ "text": "SpanRuler", "url": "/api/spanruler" },
{ "text": "Tagger", "url": "/api/tagger" },
{ "text": "TextCategorizer", "url": "/api/textcategorizer" },
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 9145855c6..d7c99956b 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,103 @@
{
"resources": [
+ {
+ "id": "spacy-cleaner",
+ "title": "spacy-cleaner",
+ "slogan": "Easily clean text with spaCy!",
+ "description": "**spacy-cleaner** utilises spaCy `Language` models to replace, remove, and \n mutate spaCy tokens. Cleaning actions available are:\n\n* Remove/replace stopwords.\n* Remove/replace punctuation.\n* Remove/replace numbers.\n* Remove/replace emails.\n* Remove/replace URLs.\n* Perform lemmatisation.\n\nSee our [docs](https://ce11an.github.io/spacy-cleaner/) for more information.",
+ "github": "Ce11an/spacy-cleaner",
+ "pip": "spacy-cleaner",
+ "code_example": [
+ "import spacy",
+ "import spacy_cleaner",
+ "from spacy_cleaner.processing import removers, replacers, mutators",
+ "",
+ "model = spacy.load(\"en_core_web_sm\")",
+ "pipeline = spacy_cleaner.Pipeline(",
+ " model,",
+ " removers.remove_stopword_token,",
+ " replacers.replace_punctuation_token,",
+ " mutators.mutate_lemma_token,",
+ ")",
+ "",
+ "texts = [\"Hello, my name is Cellan! I love to swim!\"]",
+ "",
+ "pipeline.clean(texts)",
+ "# ['hello _IS_PUNCT_ Cellan _IS_PUNCT_ love swim _IS_PUNCT_']"
+ ],
+ "code_language": "python",
+ "url": "https://ce11an.github.io/spacy-cleaner/",
+ "image": "https://raw.githubusercontent.com/Ce11an/spacy-cleaner/main/docs/assets/images/spacemen.png",
+ "author": "Cellan Hall",
+ "author_links": {
+ "twitter": "Ce11an",
+ "github": "Ce11an",
+ "website": "https://www.linkedin.com/in/cellan-hall/"
+ },
+ "category": [
+ "extension"
+ ],
+ "tags": [
+ "text-processing"
+ ]
+ },
+ {
+ "id": "Zshot",
+ "title": "Zshot",
+ "slogan": "Zero and Few shot named entity & relationships recognition",
+ "github": "ibm/zshot",
+ "pip": "zshot",
+ "code_example": [
+ "import spacy",
+ "from zshot import PipelineConfig, displacy",
+ "from zshot.linker import LinkerRegen",
+ "from zshot.mentions_extractor import MentionsExtractorSpacy",
+ "from zshot.utils.data_models import Entity",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "# zero shot definition of entities",
+ "nlp_config = PipelineConfig(",
+ " mentions_extractor=MentionsExtractorSpacy(),",
+ " linker=LinkerRegen(),",
+ " entities=[",
+ " Entity(name='Paris',",
+ " description='Paris is located in northern central France, in a north-bending arc of the river Seine'),",
+ " Entity(name='IBM',",
+ " description='International Business Machines Corporation (IBM) is an American multinational technology corporation headquartered in Armonk, New York'),",
+        "      Entity(name='New York', description='New York is a city in the U.S. state of New York'),",
+ " Entity(name='Florida', description='southeasternmost U.S. state'),",
+ " Entity(name='American',",
+ " description='American, something of, from, or related to the United States of America, commonly known as the United States or America'),",
+ " Entity(name='Chemical formula',",
+        "             description='In chemistry, a chemical formula is a way of presenting information about the chemical proportions of atoms that constitute a particular chemical compound or molecule'),",
+ " Entity(name='Acetamide',",
+ " description='Acetamide (systematic name: ethanamide) is an organic compound with the formula CH3CONH2. It is the simplest amide derived from acetic acid. It finds some use as a plasticizer and as an industrial solvent.'),",
+ " Entity(name='Armonk',",
+ " description='Armonk is a hamlet and census-designated place (CDP) in the town of North Castle, located in Westchester County, New York, United States.'),",
+ " Entity(name='Acetic Acid',",
+ " description='Acetic acid, systematically named ethanoic acid, is an acidic, colourless liquid and organic compound with the chemical formula CH3COOH'),",
+ " Entity(name='Industrial solvent',",
+ " description='Acetamide (systematic name: ethanamide) is an organic compound with the formula CH3CONH2. It is the simplest amide derived from acetic acid. It finds some use as a plasticizer and as an industrial solvent.'),",
+ " ]",
+ ")",
+ "nlp.add_pipe('zshot', config=nlp_config, last=True)",
+ "",
+ "text = 'International Business Machines Corporation (IBM) is an American multinational technology corporation' \\",
+ " ' headquartered in Armonk, New York, with operations in over 171 countries.'",
+ "",
+ "doc = nlp(text)",
+ "displacy.serve(doc, style='ent')"
+ ],
+ "thumb": "https://ibm.github.io/zshot/img/graph.png",
+ "url": "https://ibm.github.io/zshot/",
+ "author": "IBM Research",
+ "author_links": {
+ "github": "ibm",
+ "twitter": "IBMResearch",
+ "website": "https://research.ibm.com/labs/ireland/"
+ },
+ "category": ["scientific", "models", "research"]
+ },
{
"id": "concepcy",
"title": "concepCy",
@@ -2403,20 +2501,20 @@
"import spacy",
"from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
"",
- "# Load an spacy model (supported models are \"es\" and \"en\") ",
- "nlp = spacy.load('en')",
- "# Spacy 3.x",
- "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
- "# Spacy 2.x",
+ "# Load a spaCy model (supported languages are \"es\" and \"en\") ",
+ "nlp = spacy.load('en_core_web_sm')",
+ "# spaCy 3.x",
+ "nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
+ "# spaCy 2.x",
"# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
"token = nlp('prices')[0]",
"",
- "# wordnet object link spacy token with nltk wordnet interface by giving acces to",
+      "# The WordNet object links the spaCy token with the NLTK WordNet interface, giving access to",
"# synsets and lemmas ",
"token._.wordnet.synsets()",
"token._.wordnet.lemmas()",
"",
- "# And automatically tags with wordnet domains",
+      "# And automatically get info about WordNet domains",
"token._.wordnet.wordnet_domains()"
],
"author": "recognai",
@@ -3984,7 +4082,21 @@
},
"category": ["pipeline"],
"tags": ["interpretation", "ja"]
+ },
+ {
+ "id": "spacy-partial-tagger",
+ "title": "spaCy - Partial Tagger",
+ "slogan": "Sequence Tagger for Partially Annotated Dataset in spaCy",
+      "description": "This is a library to build a CRF tagger with a partially annotated dataset in spaCy. You can build your own tagger using only a dictionary.",
+ "github": "doccano/spacy-partial-tagger",
+ "pip": "spacy-partial-tagger",
+ "category": ["pipeline", "training"],
+ "author": "Yasufumi Taniguchi",
+ "author_links": {
+ "github": "yasufumy"
+ }
}
+
],
"categories": [