diff --git a/Makefile b/Makefile
index a4df0f8c8..3f10e79cc 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
SHELL := /bin/bash
ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core
endif
ifndef PYVER
diff --git a/pyproject.toml b/pyproject.toml
index 611a95d27..d48886e0c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a43,<8.0.0a50",
- "blis>=0.4.0,<0.5.0",
+ "blis>=0.4.0,<0.8.0",
"pytokenizations",
"pathy"
]
diff --git a/requirements.txt b/requirements.txt
index 44dad38e3..29695e9b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a43,<8.0.0a50
-blis>=0.4.0,<0.5.0
+blis>=0.4.0,<0.8.0
ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.8.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 7192ba9d4..e77bda2fc 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,7 +41,7 @@ install_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a43,<8.0.0a50
- blis>=0.4.0,<0.5.0
+ blis>=0.4.0,<0.8.0
wasabi>=0.8.0,<1.1.0
srsly>=2.3.0,<3.0.0
catalogue>=2.0.1,<2.1.0
@@ -92,6 +92,8 @@ ko =
natto-py==0.9.0
th =
pythainlp>=2.0
+zh =
+ spacy-pkuseg==0.0.26
[bdist_wheel]
universal = false
diff --git a/spacy/about.py b/spacy/about.py
index 037ca6bcb..373d1d2b0 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,7 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a32"
-__release__ = True
+__version__ = "3.0.0a34"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index c959c9861..373650172 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -1,4 +1,4 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
import sys
import shutil
from pathlib import Path
@@ -16,7 +16,8 @@ import os
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import ENV_VARS
+from ..util import is_compatible_version, ENV_VARS
+from .. import about
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
@@ -142,6 +143,7 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
msg.fail(invalid_err)
print("\n".join(errors))
sys.exit(1)
+ validate_project_version(config)
validate_project_commands(config)
# Make sure directories defined in config exist
for subdir in config.get("directories", []):
@@ -167,6 +169,23 @@ def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
return dict(interpolated["project"])
+def validate_project_version(config: Dict[str, Any]) -> None:
+ """If the project defines a compatible spaCy version range, chec that it's
+ compatible with the current version of spaCy.
+
+ config (Dict[str, Any]): The loaded config.
+ """
+ spacy_version = config.get("spacy_version", None)
+ if spacy_version and not is_compatible_version(about.__version__, spacy_version):
+ err = (
+ f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
+ f"that's not compatible with the version of spaCy you're running "
+ f"({about.__version__}). You can edit version requirement in the "
+ f"{PROJECT_FILE} to load it, but the project may not run as expected."
+ )
+ msg.fail(err, exits=1)
+
+
def validate_project_commands(config: Dict[str, Any]) -> None:
"""Check that project commands and workflows are valid, don't contain
duplicates, don't clash and only refer to commands that exist.
@@ -193,12 +212,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None:
)
-def get_hash(data) -> str:
+def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
"""Get the hash for a JSON-serializable object.
data: The data to hash.
+ exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
RETURNS (str): The hash.
"""
+ if isinstance(data, dict):
+ data = {k: v for k, v in data.items() if k not in exclude}
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
return hashlib.md5(data_str).hexdigest()
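
Note: with the new `exclude` argument, two lockfile-style dicts that differ only in the excluded keys hash identically. A minimal sketch (the dict values below are made up; the project.yml can analogously declare an optional `spacy_version` range such as `">=3.0.0a30,<3.1.0"`, which `validate_project_version` checks against the running version):

```python
# Illustrative only: hashes agree once the version keys are excluded.
from spacy.cli._util import get_hash

entry = {"script": ["python train.py"], "spacy_version": "3.0.0a34", "spacy_git_version": "abc1234"}
older = dict(entry, spacy_version="3.0.0a32", spacy_git_version="def5678")
exclude = ["spacy_version", "spacy_git_version"]

assert get_hash(entry) != get_hash(older)  # version keys included: hashes differ
assert get_hash(entry, exclude=exclude) == get_hash(older, exclude=exclude)  # version keys ignored
```
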
diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py
index e7e7cbbe8..6056458e2 100644
--- a/spacy/cli/project/remote_storage.py
+++ b/spacy/cli/project/remote_storage.py
@@ -7,7 +7,9 @@ import tarfile
from pathlib import Path
from .._util import get_hash, get_checksum, download_file, ensure_pathy
-from ...util import make_tempdir
+from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
+from ...git_info import GIT_VERSION
+from ... import about
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
@@ -129,7 +131,10 @@ def get_command_hash(
currently installed packages, whatever environment variables have been marked
as relevant, and the command.
"""
- hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
+ check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
+ spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
+ dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
+ hashes = [spacy_v, site_hash, env_hash] + dep_checksums
hashes.extend(cmd)
creation_bytes = "".join(hashes).encode("utf8")
return hashlib.md5(creation_bytes).hexdigest()
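
The env var is read through `check_bool_env_var` (added to `spacy/util.py` further down). A quick sketch of the toggle, which switches the command hash from the minor spaCy version to the exact git commit:

```python
import os
from spacy.util import check_bool_env_var, ENV_VARS

# Opt in to commit-level cache invalidation for remote storage / project runs
os.environ[ENV_VARS.PROJECT_USE_GIT_VERSION] = "1"
assert check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) is True
# "0" is the only explicitly falsy value; an unset variable also counts as False
os.environ[ENV_VARS.PROJECT_USE_GIT_VERSION] = "0"
assert check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION) is False
```
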
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index 69c49fba7..1a9b447ea 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -4,8 +4,11 @@ from wasabi import msg
import sys
import srsly
+from ... import about
+from ...git_info import GIT_VERSION
from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList
+from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
+from ...util import check_bool_env_var
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
@@ -62,12 +65,13 @@ def project_run(
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, err_help, **err_kwargs)
+ check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
with working_dir(project_dir) as current_dir:
- rerun = check_rerun(current_dir, cmd)
+ msg.divider(subcommand)
+ rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
if not rerun and not force:
msg.info(f"Skipping '{cmd['name']}': nothing changed")
else:
- msg.divider(subcommand)
run_commands(cmd["script"], dry=dry)
if not dry:
update_lockfile(current_dir, cmd)
@@ -171,12 +175,19 @@ def validate_subcommand(
)
-def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
+def check_rerun(
+ project_dir: Path,
+ command: Dict[str, Any],
+ *,
+ check_spacy_version: bool = True,
+ check_spacy_commit: bool = False,
+) -> bool:
"""Check if a command should be rerun because its settings or inputs/outputs
changed.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
+    check_spacy_version (bool): Whether to check that the spaCy minor version matches.
+    check_spacy_commit (bool): Whether to check that the spaCy commit hash matches.
RETURNS (bool): Whether to re-run the command.
"""
lock_path = project_dir / PROJECT_LOCK
@@ -189,10 +200,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
# Always run commands with no outputs (otherwise they'd always be skipped)
if not entry.get("outs", []):
return True
+ # Always rerun if spaCy version or commit hash changed
+ spacy_v = entry.get("spacy_version")
+ commit = entry.get("spacy_git_version")
+ if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
+ info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
+ msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
+ return True
+ if check_spacy_commit and commit != GIT_VERSION:
+ info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
+ msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
+ return True
# If the entry in the lockfile matches the lockfile entry that would be
# generated from the current command, we don't rerun because it means that
# all inputs/outputs, hashes and scripts are the same and nothing changed
- return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)
+ lock_entry = get_lock_entry(project_dir, command)
+ exclude = ["spacy_version", "spacy_git_version"]
+ return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
@@ -231,6 +255,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
"script": command["script"],
"deps": deps,
"outs": [*outs, *outs_nc],
+ "spacy_version": about.__version__,
+ "spacy_git_version": GIT_VERSION,
}
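
For illustration, a lockfile entry written by `get_lock_entry` now looks roughly like this (paths and hashes are made up). `check_rerun` compares entries with the two version keys excluded, so the keys only trigger a rerun via the explicit minor-version check or, when `SPACY_PROJECT_USE_GIT_VERSION` is set, the commit check:

```python
# Roughly the shape of a project.lock entry after this change (values made up)
lock_entry = {
    "script": ["python train.py"],
    "deps": [{"path": "corpus/train.spacy", "md5": "0123456789abcdef0123456789abcdef"}],
    "outs": [{"path": "training/model-best", "md5": None}],
    "spacy_version": "3.0.0a34",     # rerun if the spaCy minor version changes
    "spacy_git_version": "abc1234",  # only checked if SPACY_PROJECT_USE_GIT_VERSION is set
}
```
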
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 3bd237b0a..d92de9c15 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -171,9 +171,14 @@ factory = "tok2vec"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode.width}
-rows = {{ 2000 if optimize == "efficiency" else 7000 }}
-also_embed_subwords = {{ "true" if has_letters else "false" }}
-also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
+{% if has_letters -%}
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+rows = [5000, 2500, 2500, 2500]
+{% else -%}
+attrs = ["ORTH", "SHAPE"]
+rows = [5000, 2500]
+{% endif -%}
+include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
diff --git a/spacy/errors.py b/spacy/errors.py
index 20edf45b5..bf3628ce9 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,10 +456,14 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master
- E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
+ E901 = ("Failed to remove existing output directory: {path}. If your "
+ "config and the components you train change between runs, a "
+ "non-empty output directory can lead to stale pipeline data. To "
+ "solve this, remove the existing directories in the output directory.")
+ E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
"Try checking whitespace and delimiters. See "
"https://nightly.spacy.io/api/cli#convert")
- E093 = ("The token-per-line NER file is not formatted correctly. Try checking "
+ E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
"whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
"dimension refers to the output width, after the linear projection "
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 1d59ca043..2f3965fcc 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -25,8 +25,14 @@ class Russian(Language):
default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
- return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
+def make_lemmatizer(
+ nlp: Language,
+ model: Optional[Model],
+ name: str,
+ mode: str,
+ overwrite: bool = False,
+):
+ return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Russian"]
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 8d7996c63..3bcac8730 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Tuple
from thinc.api import Model
-from ...lookups import Lookups
from ...pipeline import Lemmatizer
from ...symbols import POS
from ...tokens import Token
@@ -22,9 +21,9 @@ class RussianLemmatizer(Lemmatizer):
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
- lookups: Optional[Lookups] = None,
+ overwrite: bool = False,
) -> None:
- super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+ super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
try:
from pymorphy2 import MorphAnalyzer
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index 73c065379..24c88e5a7 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -26,8 +26,10 @@ class Ukrainian(Language):
default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0},
)
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
- return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
+def make_lemmatizer(
+ nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
+):
+ return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Ukrainian"]
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 0d6febce6..009ec5044 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -3,7 +3,6 @@ from typing import Optional
from thinc.api import Model
from ..ru.lemmatizer import RussianLemmatizer
-from ...lookups import Lookups
from ...vocab import Vocab
@@ -15,9 +14,9 @@ class UkrainianLemmatizer(RussianLemmatizer):
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
- lookups: Optional[Lookups] = None,
+ overwrite: bool = False,
) -> None:
- super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+ super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
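
A hypothetical usage sketch for the new `overwrite` setting on the pymorphy2 lemmatizers (assumes `pymorphy2` is installed; `overwrite=True` lets the lemmatizer replace lemmas set by earlier components):

```python
import spacy

nlp = spacy.blank("ru")
# `overwrite` is forwarded to the Lemmatizer base class in place of the removed `lookups` arg
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2", "overwrite": True})
doc = nlp("Это тестовое предложение")
print([token.lemma_ for token in doc])
```
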
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 55a77330a..30560ed0d 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -17,8 +17,7 @@ from ... import util
# fmt: off
-_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
-_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
+_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
# fmt: on
DEFAULT_CONFIG = """
@@ -55,9 +54,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
class ChineseTokenizer(DummyTokenizer):
- def __init__(
- self, nlp: Language, segmenter: Segmenter = Segmenter.char,
- ):
+ def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
self.vocab = nlp.vocab
if isinstance(segmenter, Segmenter):
segmenter = segmenter.value
@@ -82,11 +79,13 @@ class ChineseTokenizer(DummyTokenizer):
*,
nlp: Optional[Language] = None,
pkuseg_model: Optional[str] = None,
- pkuseg_user_dict: str = "default",
+ pkuseg_user_dict: Optional[str] = "default",
):
if self.segmenter == Segmenter.pkuseg:
+ if pkuseg_user_dict is None:
+ pkuseg_user_dict = pkuseg_model
self.pkuseg_seg = try_pkuseg_import(
- pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
+ pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict
)
def __call__(self, text: str) -> Doc:
@@ -120,12 +119,12 @@ class ChineseTokenizer(DummyTokenizer):
if self.segmenter == Segmenter.pkuseg:
if reset:
try:
- import pkuseg
+ import spacy_pkuseg
- self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
+ self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
except ImportError:
msg = (
- "pkuseg not installed: unable to reset pkuseg "
+ "spacy_pkuseg not installed: unable to reset pkuseg "
"user dict. Please " + _PKUSEG_INSTALL_MSG
)
raise ImportError(msg) from None
@@ -156,23 +155,7 @@ class ChineseTokenizer(DummyTokenizer):
self.pkuseg_seg.feature_extractor.save(tempdir)
self.pkuseg_seg.model.save(tempdir)
tempdir = Path(tempdir)
- # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
- # means that it will be saved with pickle protocol 5 with
- # python 3.8, which can't be reloaded with python 3.6-3.7.
- # To try to make the model compatible with python 3.6+, reload
- # the data with pickle5 and convert it back to protocol 4.
- try:
- import pickle5
-
- with open(tempdir / "features.pkl", "rb") as fileh:
- features = pickle5.load(fileh)
- with open(tempdir / "features.pkl", "wb") as fileh:
- pickle5.dump(features, fileh, protocol=4)
- except ImportError as e:
- raise e
- except Exception:
- warnings.warn(_PKUSEG_PICKLE_WARNING)
- with open(tempdir / "features.pkl", "rb") as fileh:
+ with open(tempdir / "features.msgpack", "rb") as fileh:
pkuseg_features_b = fileh.read()
with open(tempdir / "weights.npz", "rb") as fileh:
pkuseg_weights_b = fileh.read()
@@ -213,22 +196,22 @@ class ChineseTokenizer(DummyTokenizer):
if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
with tempfile.TemporaryDirectory() as tempdir:
tempdir = Path(tempdir)
- with open(tempdir / "features.pkl", "wb") as fileh:
+ with open(tempdir / "features.msgpack", "wb") as fileh:
fileh.write(pkuseg_data["features_b"])
with open(tempdir / "weights.npz", "wb") as fileh:
fileh.write(pkuseg_data["weights_b"])
try:
- import pkuseg
+ import spacy_pkuseg
except ImportError:
raise ImportError(
- "pkuseg not installed. To use this model, "
+ "spacy-pkuseg not installed. To use this model, "
+ _PKUSEG_INSTALL_MSG
) from None
- self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
+ self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
if pkuseg_data["processors_data"]:
processors_data = pkuseg_data["processors_data"]
(user_dict, do_process, common_words, other_words) = processors_data
- self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+ self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words)
self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -244,18 +227,6 @@ class ChineseTokenizer(DummyTokenizer):
path.mkdir(parents=True)
self.pkuseg_seg.model.save(path)
self.pkuseg_seg.feature_extractor.save(path)
- # try to convert features.pkl to pickle protocol 4
- try:
- import pickle5
-
- with open(path / "features.pkl", "rb") as fileh:
- features = pickle5.load(fileh)
- with open(path / "features.pkl", "wb") as fileh:
- pickle5.dump(features, fileh, protocol=4)
- except ImportError as e:
- raise e
- except Exception:
- warnings.warn(_PKUSEG_PICKLE_WARNING)
def save_pkuseg_processors(path):
if self.pkuseg_seg:
@@ -279,26 +250,26 @@ class ChineseTokenizer(DummyTokenizer):
def load_pkuseg_model(path):
try:
- import pkuseg
+ import spacy_pkuseg
except ImportError:
if self.segmenter == Segmenter.pkuseg:
raise ImportError(
- "pkuseg not installed. To use this model, "
+ "spacy-pkuseg not installed. To use this model, "
+ _PKUSEG_INSTALL_MSG
) from None
if path.exists():
- self.pkuseg_seg = pkuseg.pkuseg(path)
+ self.pkuseg_seg = spacy_pkuseg.pkuseg(path)
def load_pkuseg_processors(path):
try:
- import pkuseg
+ import spacy_pkuseg
except ImportError:
if self.segmenter == Segmenter.pkuseg:
raise ImportError(self._pkuseg_install_msg) from None
if self.segmenter == Segmenter.pkuseg:
data = srsly.read_msgpack(path)
(user_dict, do_process, common_words, other_words) = data
- self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+ self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words)
self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -341,12 +312,13 @@ def try_jieba_import() -> None:
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
try:
- import pkuseg
+ import spacy_pkuseg
- return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
except ImportError:
- msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+ msg = "spacy-pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
raise ImportError(msg) from None
+ try:
+ return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
except FileNotFoundError:
msg = "Unable to load pkuseg model from: " + pkuseg_model
raise FileNotFoundError(msg) from None
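
For reference, a minimal sketch of switching the Chinese tokenizer to the new spacy-pkuseg package (assumes `spacy-pkuseg` is installed and the pretrained "web" model can be downloaded; the config keys mirror the test fixture updated further down in this diff):

```python
from spacy.lang.zh import Chinese

config = {
    "nlp": {
        "tokenizer": {
            "@tokenizers": "spacy.zh.ChineseTokenizer",
            "segmenter": "pkuseg",
        }
    },
    "initialize": {"tokenizer": {"pkuseg_model": "web"}},
}
nlp = Chinese.from_config(config)
nlp.initialize()  # loads the spacy-pkuseg "web" model configured above
print([token.text for token in nlp("我喜欢自然语言处理")])
```
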
diff --git a/spacy/lookups.py b/spacy/lookups.py
index fb5e3d748..133cb0672 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -289,13 +289,12 @@ class Lookups:
DOCS: https://nightly.spacy.io/api/lookups#to_disk
"""
- if len(self._tables):
- path = ensure_path(path)
- if not path.exists():
- path.mkdir()
- filepath = path / filename
- with filepath.open("wb") as file_:
- file_.write(self.to_bytes())
+ path = ensure_path(path)
+ if not path.exists():
+ path.mkdir()
+ filepath = path / filename
+ with filepath.open("wb") as file_:
+ file_.write(self.to_bytes())
def from_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 1a0979cab..23cfe883b 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -11,7 +11,7 @@ from ...ml import _character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
+from ...attrs import intify_attr
@registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -29,7 +29,7 @@ def build_hash_embed_cnn_tok2vec(
window_size: int,
maxout_pieces: int,
subword_features: bool,
- pretrained_vectors: Optional[bool]
+ pretrained_vectors: Optional[bool],
) -> Model[List[Doc], List[Floats2d]]:
"""Build spaCy's 'standard' tok2vec layer, which uses hash embedding
with subword features and a CNN with layer-normalized maxout.
@@ -54,12 +54,18 @@ def build_hash_embed_cnn_tok2vec(
a language such as Chinese.
pretrained_vectors (bool): Whether to also use static vectors.
"""
+ if subword_features:
+ attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+ row_sizes = [embed_size, embed_size // 2, embed_size // 2, embed_size // 2]
+ else:
+ attrs = ["NORM"]
+ row_sizes = [embed_size]
return build_Tok2Vec_model(
embed=MultiHashEmbed(
width=width,
- rows=embed_size,
- also_embed_subwords=subword_features,
- also_use_static_vectors=bool(pretrained_vectors),
+ rows=row_sizes,
+ attrs=attrs,
+ include_static_vectors=bool(pretrained_vectors),
),
encode=MaxoutWindowEncoder(
width=width,
@@ -93,58 +99,59 @@ def build_Tok2Vec_model(
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(
- width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
+ width: int,
+ attrs: List[Union[str, int]],
+ rows: List[int],
+ include_static_vectors: bool,
) -> Model[List[Doc], List[Floats2d]]:
"""Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it
through a feed-forward subnetwork to build a mixed representations.
- The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
- varying definitions depending on the Vocab of the Doc object passed in.
- Vectors from pretrained static vectors can also be incorporated into the
- concatenated representation.
+ The features used can be configured with the 'attrs' argument. The suggested
+ attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
+    account some subword information, without constructing a fully character-based
+    representation. If pretrained vectors are available, they can be included in
+    the representation as well, though the vectors table will be kept static
+    (i.e. it's not updated).
+
+ The `width` parameter specifies the output width of the layer and the widths
+ of all embedding tables. If static vectors are included, a learned linear
+ layer is used to map the vectors to the specified width before concatenating
+    them with the other embedding outputs. A single Maxout layer is then used to
+ reduce the concatenated vectors to the final width.
+
+ The `rows` parameter controls the number of rows used by the `HashEmbed`
+ tables. The HashEmbed layer needs surprisingly few rows, due to its use of
+ the hashing trick. Generally between 2000 and 10000 rows is sufficient,
+ even for very large vocabularies. A number of rows must be specified for each
+ table, so the `rows` list must be of the same length as the `attrs` parameter.
width (int): The output width. Also used as the width of the embedding tables.
Recommended values are between 64 and 300.
- rows (int): The number of rows for the embedding tables. Can be low, due
- to the hashing trick. Embeddings for prefix, suffix and word shape
- use half as many rows. Recommended values are between 2000 and 10000.
- also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
- features in the embeddings. If not using these, you may need more
- rows in your hash embeddings, as there will be increased chance of
- collisions.
- also_use_static_vectors (bool): Whether to also use static word vectors.
+ attrs (list of attr IDs): The token attributes to embed. A separate
+ embedding table will be constructed for each attribute.
+ rows (List[int]): The number of rows in the embedding tables. Must have the
+ same length as attrs.
+ include_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab.
"""
- cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
+ if len(rows) != len(attrs):
+ raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
seed = 7
- def make_hash_embed(feature):
+ def make_hash_embed(index):
nonlocal seed
seed += 1
- return HashEmbed(
- width,
- rows if feature == LOWER else rows // 2,
- column=cols.index(feature),
- seed=seed,
- dropout=0.0,
- )
+ return HashEmbed(width, rows[index], column=index, seed=seed, dropout=0.0)
- if also_embed_subwords:
- embeddings = [
- make_hash_embed(LOWER),
- make_hash_embed(PREFIX),
- make_hash_embed(SUFFIX),
- make_hash_embed(SHAPE),
- ]
- else:
- embeddings = [make_hash_embed(LOWER)]
- concat_size = width * (len(embeddings) + also_use_static_vectors)
- if also_use_static_vectors:
+ embeddings = [make_hash_embed(i) for i in range(len(attrs))]
+ concat_size = width * (len(embeddings) + include_static_vectors)
+ if include_static_vectors:
model = chain(
concatenate(
chain(
- FeatureExtractor(cols),
+ FeatureExtractor(attrs),
list2ragged(),
with_array(concatenate(*embeddings)),
),
@@ -155,7 +162,7 @@ def MultiHashEmbed(
)
else:
model = chain(
- FeatureExtractor(cols),
+ FeatureExtractor(list(attrs)),
list2ragged(),
with_array(concatenate(*embeddings)),
with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
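
To make the width bookkeeping in the new `MultiHashEmbed` signature concrete, here is a small worked sketch (the numbers are illustrative, matching the style of the quickstart template above):

```python
# One HashEmbed table per attribute, each producing `width`-sized vectors
width = 96
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 2500, 2500, 2500]        # must be the same length as attrs
include_static_vectors = True

# Static vectors (if any) are projected to `width` and concatenated as one more block,
# then a single Maxout layer maps the concatenation back down to `width`.
concat_size = width * (len(attrs) + include_static_vectors)
print(concat_size)  # 96 * 5 = 480
```
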
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 82f3bf37d..6d97b062f 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -210,7 +210,7 @@ class Morphologizer(Tagger):
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
- RETUTNRS (Tuple[float, float]): The loss and the gradient.
+ RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
"""
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 41ca23ace..8e103a638 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -162,7 +162,7 @@ cdef class Pipe:
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
- RETUTNRS (Tuple[float, float]): The loss and the gradient.
+ RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://nightly.spacy.io/api/pipe#get_loss
"""
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 0bfef7c7b..8fb1e664f 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -104,7 +104,7 @@ class SentenceRecognizer(Tagger):
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
- RETUTNRS (Tuple[float, float]): The loss and the gradient.
+ RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
"""
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 6cb582b36..94ac0c082 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -249,7 +249,7 @@ class Tagger(Pipe):
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
- RETUTNRS (Tuple[float, float]): The loss and the gradient.
+ RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://nightly.spacy.io/api/tagger#get_loss
"""
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index fc60ebf89..292598e3a 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -281,7 +281,7 @@ class TextCategorizer(Pipe):
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
- RETUTNRS (Tuple[float, float]): The loss and the gradient.
+ RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
"""
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 591b7e134..0d88d4090 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -448,6 +448,7 @@ class ProjectConfigSchema(BaseModel):
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
title: Optional[str] = Field(None, title="Project title")
+ spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
# fmt: on
class Config:
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 4a3d126d7..411397b42 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -248,7 +248,6 @@ def tt_tokenizer():
@pytest.fixture(scope="session")
def uk_tokenizer():
pytest.importorskip("pymorphy2")
- pytest.importorskip("pymorphy2.lang")
return get_lang_class("uk")().tokenizer
@@ -285,8 +284,7 @@ def zh_tokenizer_jieba():
@pytest.fixture(scope="session")
def zh_tokenizer_pkuseg():
- pytest.importorskip("pkuseg")
- pytest.importorskip("pickle5")
+ pytest.importorskip("spacy_pkuseg")
config = {
"nlp": {
"tokenizer": {
@@ -296,7 +294,7 @@ def zh_tokenizer_pkuseg():
},
"initialize": {
"tokenizer": {
- "pkuseg_model": "default",
+ "pkuseg_model": "web",
}
},
}
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index da4a46a47..30f945165 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -209,9 +209,13 @@ def test_doc_retokenizer_split_norm(en_vocab):
# Retokenize to split out the words in the token at doc[2].
token = doc[2]
with doc.retokenize() as retokenizer:
- retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
+ retokenizer.split(
+ token,
+ ["brown", "fox", "jumps", "over", "the"],
+ heads=[(token, idx) for idx in range(5)],
+ )
- assert doc[9].text == "w/"
+ assert doc[9].text == "w/"
assert doc[9].norm_ == "with"
- assert doc[5].text == "over"
+ assert doc[5].text == "over"
assert doc[5].norm_ == "over"
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index a4297a1d1..4b96992e1 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -350,7 +350,7 @@ def test_pipe_methods_frozen():
@pytest.mark.parametrize(
- "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"],
+ "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"]
)
def test_pipe_label_data_exports_labels(pipe):
nlp = Language()
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 06212e351..90882ae3f 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -24,9 +24,9 @@ def test_empty_doc():
tok2vec = build_Tok2Vec_model(
MultiHashEmbed(
width=width,
- rows=embed_size,
- also_use_static_vectors=False,
- also_embed_subwords=True,
+ rows=[embed_size, embed_size, embed_size, embed_size],
+ include_static_vectors=False,
+ attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
),
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
)
@@ -44,9 +44,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
tok2vec = build_Tok2Vec_model(
MultiHashEmbed(
width=width,
- rows=embed_size,
- also_use_static_vectors=False,
- also_embed_subwords=True,
+ rows=[embed_size] * 4,
+ include_static_vectors=False,
+ attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
),
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
)
@@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
@pytest.mark.parametrize(
"width,embed_arch,embed_config,encode_arch,encode_config",
[
- (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
- (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
+ (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
+ (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
],
@@ -118,9 +118,9 @@ cfg_string = """
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode.width}
- rows = 2000
- also_embed_subwords = true
- also_use_static_vectors = false
+ rows = [2000, 1000, 1000, 1000]
+ attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+ include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
diff --git a/spacy/tests/regression/test_issue5918.py b/spacy/tests/regression/test_issue5918.py
index db957709c..d25323ef6 100644
--- a/spacy/tests/regression/test_issue5918.py
+++ b/spacy/tests/regression/test_issue5918.py
@@ -1,6 +1,5 @@
from spacy.lang.en import English
from spacy.pipeline import merge_entities
-import pytest
def test_issue5918():
@@ -23,7 +22,8 @@ def test_issue5918():
assert len(doc.ents) == 3
# make it so that the third span's head is within the entity (ent_iob=I)
# bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents.
- with pytest.warns(UserWarning):
- doc[29].head = doc[33]
+ # TODO: test for logging here
+ # with pytest.warns(UserWarning):
+ # doc[29].head = doc[33]
doc = merge_entities(doc)
assert len(doc.ents) == 3
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index da048f3d6..8b3f5c2b8 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -89,9 +89,9 @@ def my_parser():
tok2vec = build_Tok2Vec_model(
MultiHashEmbed(
width=321,
- rows=5432,
- also_embed_subwords=True,
- also_use_static_vectors=False,
+ attrs=["LOWER", "SHAPE"],
+ rows=[5432, 5432],
+ include_static_vectors=False,
),
MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index e6ef45f90..b9a0a9d05 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -7,6 +7,15 @@ from spacy import util
from spacy import prefer_gpu, require_gpu
from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
+from spacy.util import dot_to_object, SimpleFrozenList
+from thinc.api import Config, Optimizer, ConfigValidationError
+from spacy.training.batchers import minibatch_by_words
+from spacy.lang.en import English
+from spacy.lang.nl import Dutch
+from spacy.language import DEFAULT_CONFIG_PATH
+from spacy.schemas import ConfigSchemaTraining
+
+from .util import get_random_doc
@pytest.fixture
@@ -140,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected):
assert util.is_unconstrained_version(constraint) is expected
+@pytest.mark.parametrize(
+ "a1,a2,b1,b2,is_match",
+ [
+ ("3.0.0", "3.0", "3.0.1", "3.0", True),
+ ("3.1.0", "3.1", "3.2.1", "3.2", False),
+ ("xxx", None, "1.2.3.dev0", "1.2", False),
+ ],
+)
+def test_minor_version(a1, a2, b1, b2, is_match):
+ assert util.get_minor_version(a1) == a2
+ assert util.get_minor_version(b1) == b2
+ assert util.is_minor_version_match(a1, b1) is is_match
+ assert util.is_minor_version_match(a2, b2) is is_match
+
+
@pytest.mark.parametrize(
"dot_notation,expected",
[
@@ -157,3 +181,128 @@ def test_dot_to_dict(dot_notation, expected):
result = util.dot_to_dict(dot_notation)
assert result == expected
assert util.dict_to_dot(result) == dot_notation
+
+
+@pytest.mark.parametrize(
+ "doc_sizes, expected_batches",
+ [
+ ([400, 400, 199], [3]),
+ ([400, 400, 199, 3], [4]),
+ ([400, 400, 199, 3, 200], [3, 2]),
+ ([400, 400, 199, 3, 1], [5]),
+ ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
+ ([400, 400, 199, 3, 1, 200], [3, 3]),
+ ([400, 400, 199, 3, 1, 999], [3, 3]),
+ ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
+ ([1, 2, 999], [3]),
+ ([1, 2, 999, 1], [4]),
+ ([1, 200, 999, 1], [2, 2]),
+ ([1, 999, 200, 1], [2, 2]),
+ ],
+)
+def test_util_minibatch(doc_sizes, expected_batches):
+ docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+ tol = 0.2
+ batch_size = 1000
+ batches = list(
+ minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
+ )
+ assert [len(batch) for batch in batches] == expected_batches
+
+ max_size = batch_size + batch_size * tol
+ for batch in batches:
+ assert sum([len(doc) for doc in batch]) < max_size
+
+
+@pytest.mark.parametrize(
+ "doc_sizes, expected_batches",
+ [
+ ([400, 4000, 199], [1, 2]),
+ ([400, 400, 199, 3000, 200], [1, 4]),
+ ([400, 400, 199, 3, 1, 1500], [1, 5]),
+ ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
+ ([1, 2, 9999], [1, 2]),
+ ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
+ ],
+)
+def test_util_minibatch_oversize(doc_sizes, expected_batches):
+ """ Test that oversized documents are returned in their own batch"""
+ docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+ tol = 0.2
+ batch_size = 1000
+ batches = list(
+ minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
+ )
+ assert [len(batch) for batch in batches] == expected_batches
+
+
+def test_util_dot_section():
+ cfg_string = """
+ [nlp]
+ lang = "en"
+ pipeline = ["textcat"]
+
+ [components]
+
+ [components.textcat]
+ factory = "textcat"
+
+ [components.textcat.model]
+ @architectures = "spacy.TextCatBOW.v1"
+ exclusive_classes = true
+ ngram_size = 1
+ no_output_layer = false
+ """
+ nlp_config = Config().from_str(cfg_string)
+ en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
+ default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
+ default_config["nlp"]["lang"] = "nl"
+ nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
+ # Test that creation went OK
+ assert isinstance(en_nlp, English)
+ assert isinstance(nl_nlp, Dutch)
+ assert nl_nlp.pipe_names == []
+ assert en_nlp.pipe_names == ["textcat"]
+ # not exclusive_classes
+ assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
+ # Test that default values got overwritten
+ assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
+ assert nl_nlp.config["nlp"]["pipeline"] == [] # default value []
+ # Test proper functioning of 'dot_to_object'
+ with pytest.raises(KeyError):
+ dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
+ with pytest.raises(KeyError):
+ dot_to_object(en_nlp.config, "nlp.unknownattribute")
+ T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
+ assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
+
+
+def test_simple_frozen_list():
+ t = SimpleFrozenList(["foo", "bar"])
+ assert t == ["foo", "bar"]
+ assert t.index("bar") == 1 # okay method
+ with pytest.raises(NotImplementedError):
+ t.append("baz")
+ with pytest.raises(NotImplementedError):
+ t.sort()
+ with pytest.raises(NotImplementedError):
+ t.extend(["baz"])
+ with pytest.raises(NotImplementedError):
+ t.pop()
+ t = SimpleFrozenList(["foo", "bar"], error="Error!")
+ with pytest.raises(NotImplementedError):
+ t.append("baz")
+
+
+def test_resolve_dot_names():
+ config = {
+ "training": {"optimizer": {"@optimizers": "Adam.v1"}},
+ "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
+ }
+ result = util.resolve_dot_names(config, ["training.optimizer"])
+ assert isinstance(result[0], Optimizer)
+ with pytest.raises(ConfigValidationError) as e:
+ util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
+ errors = e.value.errors
+ assert len(errors) == 1
+ assert errors[0]["loc"] == ["training", "xyz"]
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index a123f459d..17408f7e8 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -61,7 +61,10 @@ def get_tok2vec_kwargs():
# This actually creates models, so seems best to put it in a function.
return {
"embed": MultiHashEmbed(
- width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
+ width=32,
+ rows=[500, 500, 500],
+ attrs=["NORM", "PREFIX", "SHAPE"],
+ include_static_vectors=False,
),
"encode": MaxoutWindowEncoder(
width=32, depth=2, maxout_pieces=2, window_size=1
@@ -73,6 +76,32 @@ def test_tok2vec():
return build_Tok2Vec_model(**get_tok2vec_kwargs())
+def test_multi_hash_embed():
+ embed = MultiHashEmbed(
+ width=32,
+ rows=[500, 500, 500],
+ attrs=["NORM", "PREFIX", "SHAPE"],
+ include_static_vectors=False,
+ )
+ hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
+ assert len(hash_embeds) == 3
+ # Check they look at different columns.
+ assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2]
+ # Check they use different seeds
+ assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3
+ # Check they all have the same number of rows
+ assert [he.get_dim("nV") for he in hash_embeds] == [500, 500, 500]
+ # Now try with different row factors
+ embed = MultiHashEmbed(
+ width=32,
+ rows=[1000, 50, 250],
+ attrs=["NORM", "PREFIX", "SHAPE"],
+ include_static_vectors=False,
+ )
+ hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
+ assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250]
+
+
@pytest.mark.parametrize(
"seed,model_func,kwargs",
[
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
deleted file mode 100644
index f710a38eb..000000000
--- a/spacy/tests/test_util.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import pytest
-
-from spacy import util
-from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer, ConfigValidationError
-from spacy.training.batchers import minibatch_by_words
-from spacy.lang.en import English
-from spacy.lang.nl import Dutch
-from spacy.language import DEFAULT_CONFIG_PATH
-from spacy.schemas import ConfigSchemaTraining
-
-from .util import get_random_doc
-
-
-@pytest.mark.parametrize(
- "doc_sizes, expected_batches",
- [
- ([400, 400, 199], [3]),
- ([400, 400, 199, 3], [4]),
- ([400, 400, 199, 3, 200], [3, 2]),
- ([400, 400, 199, 3, 1], [5]),
- ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
- ([400, 400, 199, 3, 1, 200], [3, 3]),
- ([400, 400, 199, 3, 1, 999], [3, 3]),
- ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
- ([1, 2, 999], [3]),
- ([1, 2, 999, 1], [4]),
- ([1, 200, 999, 1], [2, 2]),
- ([1, 999, 200, 1], [2, 2]),
- ],
-)
-def test_util_minibatch(doc_sizes, expected_batches):
- docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
- tol = 0.2
- batch_size = 1000
- batches = list(
- minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
- )
- assert [len(batch) for batch in batches] == expected_batches
-
- max_size = batch_size + batch_size * tol
- for batch in batches:
- assert sum([len(doc) for doc in batch]) < max_size
-
-
-@pytest.mark.parametrize(
- "doc_sizes, expected_batches",
- [
- ([400, 4000, 199], [1, 2]),
- ([400, 400, 199, 3000, 200], [1, 4]),
- ([400, 400, 199, 3, 1, 1500], [1, 5]),
- ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
- ([1, 2, 9999], [1, 2]),
- ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
- ],
-)
-def test_util_minibatch_oversize(doc_sizes, expected_batches):
- """ Test that oversized documents are returned in their own batch"""
- docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
- tol = 0.2
- batch_size = 1000
- batches = list(
- minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
- )
- assert [len(batch) for batch in batches] == expected_batches
-
-
-def test_util_dot_section():
- cfg_string = """
- [nlp]
- lang = "en"
- pipeline = ["textcat"]
-
- [components]
-
- [components.textcat]
- factory = "textcat"
-
- [components.textcat.model]
- @architectures = "spacy.TextCatBOW.v1"
- exclusive_classes = true
- ngram_size = 1
- no_output_layer = false
- """
- nlp_config = Config().from_str(cfg_string)
- en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
- default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
- default_config["nlp"]["lang"] = "nl"
- nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
- # Test that creation went OK
- assert isinstance(en_nlp, English)
- assert isinstance(nl_nlp, Dutch)
- assert nl_nlp.pipe_names == []
- assert en_nlp.pipe_names == ["textcat"]
- # not exclusive_classes
- assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
- # Test that default values got overwritten
- assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
- assert nl_nlp.config["nlp"]["pipeline"] == [] # default value []
- # Test proper functioning of 'dot_to_object'
- with pytest.raises(KeyError):
- dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
- with pytest.raises(KeyError):
- dot_to_object(en_nlp.config, "nlp.unknownattribute")
- T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
- assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
-
-
-def test_simple_frozen_list():
- t = SimpleFrozenList(["foo", "bar"])
- assert t == ["foo", "bar"]
- assert t.index("bar") == 1 # okay method
- with pytest.raises(NotImplementedError):
- t.append("baz")
- with pytest.raises(NotImplementedError):
- t.sort()
- with pytest.raises(NotImplementedError):
- t.extend(["baz"])
- with pytest.raises(NotImplementedError):
- t.pop()
- t = SimpleFrozenList(["foo", "bar"], error="Error!")
- with pytest.raises(NotImplementedError):
- t.append("baz")
-
-
-def test_resolve_dot_names():
- config = {
- "training": {"optimizer": {"@optimizers": "Adam.v1"}},
- "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
- }
- result = util.resolve_dot_names(config, ["training.optimizer"])
- assert isinstance(result[0], Optimizer)
- with pytest.raises(ConfigValidationError) as e:
- util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
- errors = e.value.errors
- assert len(errors) == 1
- assert errors[0]["loc"] == ["training", "xyz"]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 3404274ce..0499dc4a7 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1528,7 +1528,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
while not heads_within_sents:
heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count)
if loop_count > 10:
- warnings.warn(Warnings.W026)
+ util.logger.debug(Warnings.W026)
break
loop_count += 1
# Set sentence starts
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index e6d10a195..13ae45bd2 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -5,7 +5,7 @@ import copy
from functools import partial
from pydantic import BaseModel, StrictStr
-from ..util import registry, logger
+from ..util import registry
from ..tokens import Doc
from .example import Example
@@ -64,7 +64,7 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
def lower_casing_augmenter(
- nlp: "Language", example: Example, *, level: float,
+ nlp: "Language", example: Example, *, level: float
) -> Iterator[Example]:
if random.random() >= level:
yield example
@@ -119,9 +119,8 @@ def make_orth_variants(
orig_token_dict = copy.deepcopy(token_dict)
ndsv = orth_variants.get("single", [])
ndpv = orth_variants.get("paired", [])
- logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
- words = token_dict.get("words", [])
- tags = token_dict.get("tags", [])
+ words = token_dict.get("ORTH", [])
+ tags = token_dict.get("TAG", [])
# keep unmodified if words or tags are not defined
if words and tags:
if lower:
@@ -154,8 +153,8 @@ def make_orth_variants(
if words[word_idx] in pair:
pair_idx = pair.index(words[word_idx])
words[word_idx] = punct_choices[punct_idx][pair_idx]
- token_dict["words"] = words
- token_dict["tags"] = tags
+ token_dict["ORTH"] = words
+ token_dict["TAG"] = tags
# modify raw
if raw is not None:
variants = []
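
The orth-variant augmenter now reads and writes the token annotations under their attribute names. Illustratively, the `token_dict` it manipulates looks like this after the change (values made up):

```python
token_dict = {
    "ORTH": ["I", "ca", "n't", "wait"],   # previously under the "words" key
    "TAG": ["PRP", "MD", "RB", "VB"],     # previously under the "tags" key
}
```
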
diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index 28f0f87c3..c01686aee 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -103,7 +103,7 @@ def conll_ner_to_docs(
lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
cols = list(zip(*[line.split() for line in lines]))
if len(cols) < 2:
- raise ValueError(Errors.E093)
+ raise ValueError(Errors.E903)
length = len(cols[0])
words.extend(cols[0])
sent_starts.extend([True] + [False] * (length - 1))
diff --git a/spacy/training/converters/iob_to_docs.py b/spacy/training/converters/iob_to_docs.py
index 73ad8953d..a2185fef7 100644
--- a/spacy/training/converters/iob_to_docs.py
+++ b/spacy/training/converters/iob_to_docs.py
@@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
sent_words, sent_iob = zip(*sent_tokens)
sent_tags = ["-"] * len(sent_words)
else:
- raise ValueError(Errors.E092)
+ raise ValueError(Errors.E902)
words.extend(sent_words)
tags.extend(sent_tags)
iob.extend(sent_iob)
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 0d4414964..67f61567e 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -3,19 +3,24 @@ from typing import Optional, TYPE_CHECKING
from pathlib import Path
from timeit import default_timer as timer
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
+from wasabi import Printer
import random
-import wasabi
import sys
+import shutil
from .example import Example
from ..schemas import ConfigSchemaTraining
from ..errors import Errors
-from ..util import resolve_dot_names, registry
+from ..util import resolve_dot_names, registry, logger
if TYPE_CHECKING:
from ..language import Language # noqa: F401
+DIR_MODEL_BEST = "model-best"
+DIR_MODEL_LAST = "model-last"
+
+
def train(
nlp: "Language",
output_path: Optional[Path] = None,
@@ -38,7 +43,7 @@ def train(
RETURNS (Path / None): The path to the final exported model.
"""
# We use no_print here so we can respect the stdout/stderr options.
- msg = wasabi.Printer(no_print=True)
+ msg = Printer(no_print=True)
# Create iterator, which yields out info after each optimization step.
config = nlp.config.interpolate()
if config["training"]["seed"] is not None:
@@ -69,6 +74,7 @@ def train(
eval_frequency=T["eval_frequency"],
exclude=frozen_components,
)
+ clean_output_dir(output_path)
stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
if frozen_components:
stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
@@ -83,7 +89,7 @@ def train(
update_meta(T, nlp, info)
with nlp.use_params(optimizer.averages):
nlp = before_to_disk(nlp)
- nlp.to_disk(output_path / "model-best")
+ nlp.to_disk(output_path / DIR_MODEL_BEST)
except Exception as e:
if output_path is not None:
# We don't want to swallow the traceback if we don't have a
@@ -100,7 +106,7 @@ def train(
finally:
finalize_logger()
if output_path is not None:
- final_model_path = output_path / "model-last"
+ final_model_path = output_path / DIR_MODEL_LAST
if optimizer.averages:
with nlp.use_params(optimizer.averages):
nlp.to_disk(final_model_path)
@@ -305,3 +311,19 @@ def create_before_to_disk_callback(
return modified_nlp
return before_to_disk
+
+
+def clean_output_dir(path: Union[str, Path]) -> None:
+ """Remove an existing output directory. Typically used to ensure that that
+ a directory like model-best and its contents aren't just being overwritten
+ by nlp.to_disk, which could preserve existing subdirectories (e.g.
+ components that don't exist anymore).
+ """
+ if path is not None and path.exists():
+ for subdir in [path / DIR_MODEL_BEST, path / DIR_MODEL_LAST]:
+ if subdir.exists():
+ try:
+ shutil.rmtree(str(subdir))
+ logger.debug(f"Removed existing output directory: {subdir}")
+ except Exception as e:
+ raise IOError(Errors.E901.format(path=path)) from e
diff --git a/spacy/util.py b/spacy/util.py
index 4d68e829c..aa321b22f 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -73,6 +73,7 @@ logger = logging.getLogger("spacy")
class ENV_VARS:
CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
+ PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"
class registry(thinc.registry):
@@ -584,6 +585,33 @@ def get_base_version(version: str) -> str:
return Version(version).base_version
+def get_minor_version(version: str) -> Optional[str]:
+ """Get the major + minor version (without patch or prerelease identifiers).
+
+ version (str): The version.
+ RETURNS (str): The major + minor version or None if version is invalid.
+ """
+ try:
+ v = Version(version)
+ except (TypeError, InvalidVersion):
+ return None
+ return f"{v.major}.{v.minor}"
+
+
+def is_minor_version_match(version_a: str, version_b: str) -> bool:
+ """Compare two versions and check if they match in major and minor, without
+ patch or prerelease identifiers. Used internally for compatibility checks
+ that should be insensitive to patch releases.
+
+ version_a (str): The first version
+ version_b (str): The second version.
+ RETURNS (bool): Whether the versions match.
+ """
+ a = get_minor_version(version_a)
+ b = get_minor_version(version_b)
+ return a is not None and b is not None and a == b
+
+
def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Load a model meta.json from a path and validate its contents.
@@ -1315,3 +1343,16 @@ def is_cython_func(func: Callable) -> bool:
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
return hasattr(cls_func, attr)
return False
+
+
+def check_bool_env_var(env_var: str) -> bool:
+ """Convert the value of an environment variable to a boolean. Add special
+ check for "0" (falsy) and consider everything else truthy, except unset.
+
+ env_var (str): The name of the environment variable to check.
+ RETURNS (bool): Its boolean value.
+ """
+ value = os.environ.get(env_var, False)
+ if value == "0":
+ return False
+ return bool(value)
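
A quick illustration of the new version helpers (return values shown inline, matching the parametrized cases in `test_misc.py` above):

```python
from spacy import util

util.get_minor_version("3.0.0a34")             # "3.0"
util.get_minor_version("xxx")                  # None (invalid version)
util.is_minor_version_match("3.0.0", "3.0.1")  # True: same major.minor
util.is_minor_version_match("3.1.0", "3.2.1")  # False
```
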
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index a22f12c65..93918250b 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -445,9 +445,9 @@ cdef class Vocab:
setters = ["strings", "vectors"]
if "strings" not in exclude:
self.strings.to_disk(path / "strings.json")
- if "vectors" not in "exclude" and self.vectors is not None:
+ if "vectors" not in "exclude":
self.vectors.to_disk(path)
- if "lookups" not in "exclude" and self.lookups is not None:
+ if "lookups" not in "exclude":
self.lookups.to_disk(path)
def from_disk(self, path, *, exclude=tuple()):
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 5cee45ba5..5246a3ed6 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -136,25 +136,28 @@ argument that connects to the shared `tok2vec` component in the pipeline.
> [model]
> @architectures = "spacy.MultiHashEmbed.v1"
> width = 64
-> rows = 2000
-> also_embed_subwords = false
-> also_use_static_vectors = false
+> attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+> rows = [2000, 1000, 1000, 1000]
+> include_static_vectors = true
> ```
Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build mixed representations. The features used are
-the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
-[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
-vectors can also be incorporated into the concatenated representation.
+a feed-forward subnetwork to build a mixed representation. The features used
+can be configured with the `attrs` argument. The suggested attributes are
+`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
+some subword information, without constructing a fully character-based
+representation. If pretrained vectors are available, they can be included in
+the representation as well, though the vectors table is kept static (i.e. it's
+not updated).
-| Name | Description |
-| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ |
-| `rows` | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ |
-| `also_embed_subwords` | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~ |
-| `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+| Name | Description |
+| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |
+| `attrs` | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ |
+| `rows`                   | The number of rows for each embedding table. The layer needs surprisingly few rows, due to its use of the hashing trick: generally between `1000` and `10000` rows is sufficient, even for very large vocabularies. A number of rows must be specified for each table, so the `rows` list must be of the same length as the `attrs` parameter. ~~List[int]~~ |
+| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.CharacterEmbed.v1 {#CharacterEmbed}
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index 4f5ac6f61..de35f9eb4 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -226,6 +226,12 @@ the "catastrophic forgetting" problem. This feature is experimental.
Find the loss and gradient of loss for the batch of documents and their
predicted scores.
+
+
+This method needs to be overridden with your own custom `get_loss` method.
+
+
+
> #### Example
>
> ```python
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index b65c3d903..24c7bf1cf 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -86,7 +86,8 @@ see are:
| ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. |
| ~~Padded~~ | A container to handle variable-length sequence data in a padded contiguous array. |
-The model type signatures help you figure out which model architectures and
+See the [Thinc type reference](https://thinc.ai/docs/api-types) for details. The
+model type signatures help you figure out which model architectures and
components can **fit together**. For instance, the
[`TextCategorizer`](/api/textcategorizer) class expects a model typed
~~Model[List[Doc], Floats2d]~~, because the model will predict one row of
@@ -288,7 +289,7 @@ those parts of the network.
To use our custom model including the PyTorch subnetwork, all we need to do is
register the architecture using the
-[`architectures` registry](/api/top-level#registry). This will assign the
+[`architectures` registry](/api/top-level#registry). This assigns the
architecture a name so spaCy knows how to find it, and allows passing in
arguments like hyperparameters via the [config](/usage/training#config). The
full example then becomes:
@@ -373,7 +374,7 @@ gpu_allocator = "pytorch"
Of course it's also possible to define the `Model` from the previous section
entirely in Thinc. The Thinc documentation provides details on the
[various layers](https://thinc.ai/docs/api-layers) and helper functions
-available. Combinators can also be used to
+available. Combinators can be used to
[overload operators](https://thinc.ai/docs/usage-models#operators) and a common
usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our
simple neural network would then become:
@@ -486,28 +487,376 @@ with Model.define_operators({">>": chain}):
## Create new trainable components {#components}
-
+In addition to [swapping out](#swap-architectures) default models in built-in
+components, you can also implement an entirely new,
+[trainable](/usage/processing-pipelines#trainable-components) pipeline component
+from scratch. This can be done by creating a new class inheriting from
+[`Pipe`](/api/pipe), and linking it up to your custom model implementation.
+
+
+
+For details on how to implement pipeline components, check out the usage guide
+on [custom components](/usage/processing-pipelines#custom-component) and the
+overview of the `Pipe` methods used by
+[trainable components](/usage/processing-pipelines#trainable-components).
+
-
+### Example: Entity relation extraction component {#component-rel}
-
+
+#### Step 1: Implementing the Model {#component-rel-model}
+
+We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a
+**list of documents** (~~List[Doc]~~) as input, and outputs a **two-dimensional
+matrix** (~~Floats2d~~) of predictions:
+
+> #### Model type annotations
+>
+> The `Model` class is a generic type that can specify its input and output
+> types, e.g. ~~Model[List[Doc], Floats2d]~~. Type hints are used for static
+> type checks and validation. See the section on [type signatures](#type-sigs)
+> for details.
```python
-def update(self, examples):
- docs = [ex.predicted for ex in examples]
- refs = [ex.reference for ex in examples]
- predictions, backprop = self.model.begin_update(docs)
- gradient = self.get_loss(predictions, refs)
- backprop(gradient)
-
-def __call__(self, doc):
- predictions = self.model([doc])
- self.set_annotations(predictions)
+### Register the model architecture
+@registry.architectures.register("rel_model.v1")
+def create_relation_model(...) -> Model[List[Doc], Floats2d]:
+ model = ... # 👈 model will go here
+ return model
```
--->
+
+The first layer in this model will typically be an
+[embedding layer](/usage/embeddings-transformers) such as a
+[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
+layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
+transforms each **document into a list of tokens**, with each token being
+represented by its embedding in the vector space.
+
+Next, we need a method that **generates pairs of entities** that we want to
+classify as being related or not. As these candidate pairs are typically formed
+within one document, this function takes a [`Doc`](/api/doc) as input and
+outputs a `List` of `Span` tuples. For instance, a very straightforward
+implementation would be to just take any two entities from the same document:
+
+```python
+### Simple candidate generation
+def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
+ candidates = []
+ for ent1 in doc.ents:
+ for ent2 in doc.ents:
+ candidates.append((ent1, ent2))
+ return candidates
+```
+
+But we could also refine this further by **excluding relations** of an entity
+with itself, and imposing a **maximum distance** (in number of tokens) between two
+entities. We register this function in the
+[`@misc` registry](/api/top-level#registry) so we can refer to it from the
+config, and easily swap it out for any other candidate generation function.
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [model]
+> @architectures = "rel_model.v1"
+>
+> [model.tok2vec]
+> # ...
+>
+> [model.get_candidates]
+> @misc = "rel_cand_generator.v1"
+> max_length = 20
+> ```
+
+```python
+### Extended candidate generation {highlight="1,2,7,8"}
+@registry.misc.register("rel_cand_generator.v1")
+def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
+ def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
+ candidates = []
+ for ent1 in doc.ents:
+ for ent2 in doc.ents:
+ if ent1 != ent2:
+ if max_length and abs(ent2.start - ent1.start) <= max_length:
+ candidates.append((ent1, ent2))
+ return candidates
+ return get_candidates
+```
+
+Finally, we require a method that transforms the candidate entity pairs into a
+2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or
+[`Transformer`](/api/transformer). The resulting ~~Floats2d~~ object will then be
+processed by a final `output_layer` of the network. Putting all this together,
+we can define our relation model in a config file as follows:
+
+```ini
+### config.cfg
+[model]
+@architectures = "rel_model.v1"
+# ...
+
+[model.tok2vec]
+# ...
+
+[model.get_candidates]
+@misc = "rel_cand_generator.v1"
+max_length = 20
+
+[model.create_candidate_tensor]
+@misc = "rel_cand_tensor.v1"
+
+[model.output_layer]
+@architectures = "rel_output_layer.v1"
+# ...
+```
+
+
+
+
+When creating this model, we store the custom functions as
+[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
+references, so we can access them easily:
+
+```python
+tok2vec_layer = model.get_ref("tok2vec")
+output_layer = model.get_ref("output_layer")
+create_candidate_tensor = model.attrs["create_candidate_tensor"]
+get_candidates = model.attrs["get_candidates"]
+```
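+
+For illustration, here's a minimal sketch of how those references and
+attributes could be attached when constructing the model. The helper
+`build_relation_model` and the plain `chain` of the two sublayers are
+hypothetical placeholders for the real forward logic:
+
+```python
+### Attaching sublayers and attributes (sketch)
+from typing import Callable, List, Tuple
+from thinc.api import Model, chain
+from spacy.tokens import Doc, Span
+
+def build_relation_model(
+    tok2vec: Model,
+    output_layer: Model,
+    get_candidates: Callable[[Doc], List[Tuple[Span, Span]]],
+    create_candidate_tensor: Callable,
+) -> Model:
+    # Chaining the sublayers stands in for the real forward pass, which would
+    # run tok2vec, build the candidate tensor and apply the output layer.
+    model = chain(tok2vec, output_layer)
+    model.set_ref("tok2vec", tok2vec)
+    model.set_ref("output_layer", output_layer)
+    model.attrs["get_candidates"] = get_candidates
+    model.attrs["create_candidate_tensor"] = create_candidate_tensor
+    return model
+```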
+
+#### Step 2: Implementing the pipeline component {#component-rel-pipe}
+
+To use our new relation extraction model as part of a custom
+[trainable component](/usage/processing-pipelines#trainable-components), we
+create a subclass of [`Pipe`](/api/pipe) that holds the model:
+
+```python
+### Pipeline component skeleton
+from spacy.pipeline import Pipe
+
+class RelationExtractor(Pipe):
+ def __init__(self, vocab, model, name="rel"):
+ """Create a component instance."""
+ self.model = model
+ self.vocab = vocab
+ self.name = name
+
+ def update(self, examples, drop=0.0, set_annotations=False, sgd=None, losses=None):
+ """Learn from a batch of Example objects."""
+ ...
+
+ def predict(self, docs):
+ """Apply the model to a batch of Doc objects."""
+ ...
+
+ def set_annotations(self, docs, predictions):
+ """Modify a batch of Doc objects using the predictions."""
+ ...
+
+ def initialize(self, get_examples, nlp=None, labels=None):
+ """Initialize the model before training."""
+ ...
+
+ def add_label(self, label):
+ """Add a label to the component."""
+ ...
+```
+
+Before the model can be used, it needs to be
+[initialized](/usage/training#initialization). This function receives a callback
+to access the full **training data set**, or a representative sample. This data
+set can be used to deduce all **relevant labels**. Alternatively, a list of
+labels can be provided to `initialize`, or you can call the
+`RelationExtractor.add_label` method directly. The number of labels defines the output
+dimensionality of the network, and will be used to do
+[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
+layers of the neural network. This is triggered by calling
+[`Model.initialize`](https://thinc.ai/docs/api-model#initialize).
+
+```python
+### The initialize method {highlight="12,18,22"}
+from itertools import islice
+
+def initialize(
+ self,
+ get_examples: Callable[[], Iterable[Example]],
+ *,
+ nlp: Language = None,
+ labels: Optional[List[str]] = None,
+):
+ if labels is not None:
+ for label in labels:
+ self.add_label(label)
+ else:
+ for example in get_examples():
+ relations = example.reference._.rel
+ for indices, label_dict in relations.items():
+ for label in label_dict.keys():
+ self.add_label(label)
+ subbatch = list(islice(get_examples(), 10))
+ doc_sample = [eg.reference for eg in subbatch]
+ label_sample = self._examples_to_truth(subbatch)
+ self.model.initialize(X=doc_sample, Y=label_sample)
+```
+
+The `initialize` method is triggered whenever this component is part of an `nlp`
+pipeline, and [`nlp.initialize`](/api/language#initialize) is invoked.
+Typically, this happens when the pipeline is set up before training in
+[`spacy train`](/api/cli#training). After initialization, the pipeline component
+and its internal model can be trained and used to make predictions.
+
+During training, the function [`update`](/api/pipe#update) is invoked which
+delegates to
+[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
+[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
+batch of examples, as well as the **gradient** of loss that will be used to
+update the weights of the model layers. Thinc provides several
+[loss functions](https://thinc.ai/docs/api-loss) that can be used for the
+implementation of the `get_loss` function.
+
+```python
+### The update method {highlight="12-14"}
+def update(
+ self,
+ examples: Iterable[Example],
+ *,
+ drop: float = 0.0,
+ set_annotations: bool = False,
+ sgd: Optional[Optimizer] = None,
+ losses: Optional[Dict[str, float]] = None,
+) -> Dict[str, float]:
+ ...
+ docs = [ex.predicted for ex in examples]
+ predictions, backprop = self.model.begin_update(docs)
+ loss, gradient = self.get_loss(examples, predictions)
+ backprop(gradient)
+ losses[self.name] += loss
+ ...
+ return losses
+```
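+
+One possible `get_loss` implementation is a simple squared-error loss. The
+sketch below reuses the `_examples_to_truth` helper from `initialize` to build
+a gold-standard tensor that lines up row-for-row with the predictions; a Thinc
+loss such as `L2Distance` could be used instead:
+
+```python
+### A possible get_loss implementation (sketch)
+def get_loss(self, examples: Iterable[Example], scores: Floats2d) -> Tuple[float, Floats2d]:
+    # Build the gold-standard labels in the same shape as the predictions
+    truths = self.model.ops.asarray(self._examples_to_truth(examples))
+    gradient = scores - truths
+    loss = float((gradient ** 2).sum())
+    return loss, gradient
+```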
+
+When the internal model is trained, the component can be used to make novel
+**predictions**. The [`predict`](/api/pipe#predict) function needs to be
+implemented for each subclass of `Pipe`. In our case, we can simply delegate to
+the internal model's [predict](https://thinc.ai/docs/api-model#predict) function
+that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array:
+
+```python
+### The predict method
+def predict(self, docs: Iterable[Doc]) -> Floats2d:
+ predictions = self.model.predict(docs)
+ return self.model.ops.asarray(predictions)
+```
+
+The final method that needs to be implemented is
+[`set_annotations`](/api/pipe#set_annotations). This function takes the
+predictions, and modifies the given `Doc` object in place to store them. For our
+relation extraction component, we store the data as a dictionary in a custom
+[extension attribute](/usage/processing-pipelines#custom-components-attributes)
+`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of
+each entity**, as this defines an entity pair uniquely within one document.
+
+To interpret the scores predicted by the relation extraction model correctly, we
+need to refer to the model's `get_candidates` function that defined which pairs
+of entities were relevant candidates, so that the predictions can be linked to
+those exact entities:
+
+> #### Example output
+>
+> ```python
+> doc = nlp("Amsterdam is the capital of the Netherlands.")
+> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
+> for value, rel_dict in doc._.rel.items():
+> print(f"{value}: {rel_dict}")
+>
+> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
+> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
+> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
+> ```
+
+```python
+### Registering the extension attribute
+from spacy.tokens import Doc
+Doc.set_extension("rel", default={})
+```
+
+```python
+### The set_annotations method {highlight="5-6,10"}
+def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
+ c = 0
+ get_candidates = self.model.attrs["get_candidates"]
+ for doc in docs:
+ for (e1, e2) in get_candidates(doc):
+ offset = (e1.start, e2.start)
+ if offset not in doc._.rel:
+ doc._.rel[offset] = {}
+ for j, label in enumerate(self.labels):
+ doc._.rel[offset][label] = predictions[c, j]
+ c += 1
+```
+
+Under the hood, when the pipe is applied to a document, it delegates to the
+`predict` and `set_annotations` methods:
+
+```python
+### The __call__ method
+def __call__(self, doc: Doc):
+ predictions = self.predict([doc])
+ self.set_annotations([doc], predictions)
+ return doc
+```
+
+Once our `Pipe` subclass is fully implemented, we can
+[register](/usage/processing-pipelines#custom-components-factories) the
+component with the [`@Language.factory`](/api/language#factory) decorator. This
+assigns it a name and lets you create the component with
+[`nlp.add_pipe`](/api/language#add_pipe) and via the
+[config](/usage/training#config).
+
+> #### config.cfg (excerpt)
+>
+> ```ini
+> [components.relation_extractor]
+> factory = "relation_extractor"
+>
+> [components.relation_extractor.model]
+> @architectures = "rel_model.v1"
+>
+> [components.relation_extractor.model.tok2vec]
+> # ...
+>
+> [components.relation_extractor.model.get_candidates]
+> @misc = "rel_cand_generator.v1"
+> max_length = 20
+> ```
+
+```python
+### Registering the pipeline component
+from spacy.language import Language
+
+@Language.factory("relation_extractor")
+def make_relation_extractor(nlp, name, model):
+ return RelationExtractor(nlp.vocab, model, name)
+```
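+
+Once the pipeline is trained and saved out, loading and applying it works like
+any other spaCy pipeline, provided the custom code above has been imported so
+the factory and architectures are registered. A hypothetical example, with
+`./training/model-best` standing in for your trained output directory:
+
+```python
+### Applying the trained component (sketch)
+import spacy
+
+nlp = spacy.load("./training/model-best")
+doc = nlp("Amsterdam is the capital of the Netherlands.")
+for (start1, start2), scores in doc._.rel.items():
+    print(start1, start2, scores)
+```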
+
+
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 3d0c7b7e9..c8224dfc9 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1176,7 +1176,7 @@ plug fully custom machine learning components into your pipeline. You'll need
the following:
1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
- can be a model using implemented in
+ can be a model implemented in
[Thinc](/usage/layers-architectures#thinc), or a
[wrapped model](/usage/layers-architectures#frameworks) implemented in
PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 6d5746308..5fced922d 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -216,15 +216,16 @@ pipelines.
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
```
-| Section | Description |
-| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
-| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
-| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
-| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
-| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
-| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
-| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
+| Section | Description |
+| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
+| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
+| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
+| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
+| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
+| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
+| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
+| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If the project is loaded with an incompatible version of spaCy, an error is raised. |
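+
+For example, a project that's only compatible with spaCy v3.0.x could declare
+the range at the top level of its `project.yml` (a hypothetical excerpt):
+
+```yaml
+### project.yml (excerpt)
+title: "Example pipeline"
+spacy_version: ">=3.0.0,<3.1.0"
+```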
### Data assets {#data-assets}
diff --git a/website/src/styles/quickstart.module.sass b/website/src/styles/quickstart.module.sass
index a08d6bcb6..8ad106a78 100644
--- a/website/src/styles/quickstart.module.sass
+++ b/website/src/styles/quickstart.module.sass
@@ -38,7 +38,7 @@
cursor: pointer
display: inline-block
padding: 0.35rem 0.5rem 0.25rem 0
- margin: 0 1rem 0.75rem 0
+ margin: 0 1rem 0.5rem 0
font-size: var(--font-size-xs)
font-weight: bold
@@ -73,16 +73,19 @@
background: var(--color-theme)
.checkbox + &:before
+ $size: 18px
content: ""
display: inline-block
- width: 20px
- height: 20px
+ width: $size
+ height: $size
border: 1px solid var(--color-subtle)
vertical-align: middle
margin-right: 0.5rem
cursor: pointer
- border-radius: var(--border-radius)
+ border-radius: $size / 4
background: var(--color-back)
+ position: relative
+ top: -1px
.checkbox:checked + &:before
// Embed "check" icon here for simplicity
diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 741973945..ab91b8e30 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -4,6 +4,8 @@ import { StaticQuery, graphql } from 'gatsby'
import { Quickstart, QS } from '../components/quickstart'
import { repo } from '../components/util'
+const DEFAULT_MODELS = ['en']
+const DEFAULT_OPT = 'efficiency'
const DEFAULT_HARDWARE = 'cpu'
const DEFAULT_CUDA = 'cuda100'
const CUDA = {
@@ -15,6 +17,7 @@ const CUDA = {
'10.1': 'cuda101',
'10.2': 'cuda102',
}
+const LANG_EXTRAS = ['zh', 'ja'] // only for languages with models
const DATA = [
{
id: 'os',
@@ -68,14 +71,24 @@ const QuickstartInstall = ({ id, title }) => {
const [train, setTrain] = useState(false)
const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
const [cuda, setCuda] = useState(DEFAULT_CUDA)
+ const [selectedModels, setModels] = useState(DEFAULT_MODELS)
+ const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
const setters = {
hardware: v => (Array.isArray(v) ? setHardware(v[0]) : setCuda(v)),
config: v => setTrain(v.includes('train')),
+ models: setModels,
+ optimize: v => setEfficiency(v.includes('efficiency')),
}
const showDropdown = {
hardware: () => hardware === 'gpu',
}
- const pipExtras = [hardware === 'gpu' && cuda, train && 'transformers', train && 'lookups']
+ const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : []
+ const pipExtras = [
+ hardware === 'gpu' && cuda,
+ train && 'transformers',
+ train && 'lookups',
+ ...modelExtras,
+ ]
.filter(e => e)
.join(',')
return (
@@ -89,13 +102,37 @@ const QuickstartInstall = ({ id, title }) => {
...DATA,
{
id: 'models',
- title: 'Trained Pipelines',
+ title: 'Trained pipelines',
multiple: true,
options: models
.sort((a, b) => a.name.localeCompare(b.name))
- .map(({ code, name }) => ({ id: code, title: name })),
+ .map(({ code, name }) => ({
+ id: code,
+ title: name,
+ checked: DEFAULT_MODELS.includes(code),
+ })),
},
]
+ if (selectedModels.length) {
+ data.push({
+ id: 'optimize',
+ title: 'Select pipeline for',
+ options: [
+ {
+ id: 'efficiency',
+ title: 'efficiency',
+ checked: DEFAULT_OPT === 'efficiency',
+ help: 'Faster and smaller pipeline, but less accurate',
+ },
+ {
+ id: 'accuracy',
+ title: 'accuracy',
+ checked: DEFAULT_OPT === 'accuracy',
+ help: 'Larger and slower pipeline, but more accurate',
+ },
+ ],
+ })
+ }
return (
{
conda install -c conda-forge spacy-lookups-data
- {models.map(({ code, models: modelOptions }) => (
-
- python -m spacy download {modelOptions[0]}
-
- ))}
+ {models.map(({ code, models: modelOptions }) => {
+ const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]
+ return (
+
+ python -m spacy download {pkg}
+
+ )
+ })}
)
}}
diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js
index ffd1b3df9..5f94c60cb 100644
--- a/website/src/widgets/quickstart-models.js
+++ b/website/src/widgets/quickstart-models.js
@@ -31,25 +31,33 @@ const data = [
},
{
id: 'optimize',
- title: 'Optimize for',
- help:
- 'Optimize for efficiency (faster & smaller model) or higher accuracy (larger & slower model)',
+ title: 'Select for',
options: [
- { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
- { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
+ {
+ id: 'efficiency',
+ title: 'efficiency',
+ checked: DEFAULT_OPT === 'efficiency',
+ help: 'Faster and smaller pipeline, but less accurate',
+ },
+ {
+ id: 'accuracy',
+ title: 'accuracy',
+ checked: DEFAULT_OPT === 'accuracy',
+ help: 'Larger and slower pipeline, but more accurate',
+ },
],
},
{
id: 'config',
title: 'Options',
multiple: true,
- options: [{ id: 'example', title: 'Show usage example' }],
+ options: [{ id: 'example', title: 'Show text example' }],
},
]
const QuickstartInstall = ({ id, title, description, children }) => {
const [lang, setLang] = useState(DEFAULT_LANG)
- const [efficiency, setEfficiency] = useState(DEFAULT_OPT)
+ const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
const setters = {
lang: setLang,
optimize: v => setEfficiency(v.includes('efficiency')),