Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-10-05 22:01:13 +02:00
commit e8156d191f
47 changed files with 943 additions and 380 deletions

View File

@ -1,7 +1,7 @@
SHELL := /bin/bash
ifndef SPACY_EXTRAS
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core
endif
ifndef PYVER

View File

@ -7,7 +7,7 @@ requires = [
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a43,<8.0.0a50",
"blis>=0.4.0,<0.5.0",
"blis>=0.4.0,<0.8.0",
"pytokenizations",
"pathy"
]

View File

@ -2,7 +2,7 @@
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a43,<8.0.0a50
blis>=0.4.0,<0.5.0
blis>=0.4.0,<0.8.0
ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.8.0,<1.1.0

View File

@ -41,7 +41,7 @@ install_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a43,<8.0.0a50
blis>=0.4.0,<0.5.0
blis>=0.4.0,<0.8.0
wasabi>=0.8.0,<1.1.0
srsly>=2.3.0,<3.0.0
catalogue>=2.0.1,<2.1.0
@ -92,6 +92,8 @@ ko =
natto-py==0.9.0
th =
pythainlp>=2.0
zh =
spacy-pkuseg==0.0.26
[bdist_wheel]
universal = false

View File

@ -1,7 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a32"
__release__ = True
__version__ = "3.0.0a34"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -1,4 +1,4 @@
from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
import sys
import shutil
from pathlib import Path
@ -16,7 +16,8 @@ import os
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import ENV_VARS
from ..util import is_compatible_version, ENV_VARS
from .. import about
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
@ -142,6 +143,7 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
msg.fail(invalid_err)
print("\n".join(errors))
sys.exit(1)
validate_project_version(config)
validate_project_commands(config)
# Make sure directories defined in config exist
for subdir in config.get("directories", []):
@ -167,6 +169,23 @@ def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
return dict(interpolated["project"])
def validate_project_version(config: Dict[str, Any]) -> None:
"""If the project defines a compatible spaCy version range, chec that it's
compatible with the current version of spaCy.
config (Dict[str, Any]): The loaded config.
"""
spacy_version = config.get("spacy_version", None)
if spacy_version and not is_compatible_version(about.__version__, spacy_version):
err = (
f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
f"that's not compatible with the version of spaCy you're running "
f"({about.__version__}). You can edit version requirement in the "
f"{PROJECT_FILE} to load it, but the project may not run as expected."
)
msg.fail(err, exits=1)
def validate_project_commands(config: Dict[str, Any]) -> None:
"""Check that project commands and workflows are valid, don't contain
duplicates, don't clash and only refer to commands that exist.
@ -193,12 +212,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None:
)
def get_hash(data) -> str:
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
"""Get the hash for a JSON-serializable object.
data: The data to hash.
exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
RETURNS (str): The hash.
"""
if isinstance(data, dict):
data = {k: v for k, v in data.items() if k not in exclude}
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
return hashlib.md5(data_str).hexdigest()
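
Note: the new `exclude` argument only filters top-level keys when the data is a dict, so two entries that differ only in excluded keys hash identically. A standalone sketch of the helper above, using hypothetical lock-entry data, to make that concrete:

```python
import hashlib
import srsly

def get_hash(data, exclude=tuple()) -> str:
    # Same logic as the helper above: drop excluded top-level keys, then hash
    # the sorted JSON representation.
    if isinstance(data, dict):
        data = {k: v for k, v in data.items() if k not in exclude}
    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
    return hashlib.md5(data_str).hexdigest()

# Hypothetical lockfile entries that differ only in the recorded spaCy version:
a = {"script": ["python run.py"], "spacy_version": "3.0.0a33"}
b = {"script": ["python run.py"], "spacy_version": "3.0.0a34"}
assert get_hash(a) != get_hash(b)
assert get_hash(a, exclude=["spacy_version"]) == get_hash(b, exclude=["spacy_version"])
```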

View File

@ -7,7 +7,9 @@ import tarfile
from pathlib import Path
from .._util import get_hash, get_checksum, download_file, ensure_pathy
from ...util import make_tempdir
from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about
if TYPE_CHECKING:
from pathy import Pathy # noqa: F401
@ -129,7 +131,10 @@ def get_command_hash(
currently installed packages, whatever environment variables have been marked
as relevant, and the command.
"""
hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
hashes = [spacy_v, site_hash, env_hash] + dep_checksums
hashes.extend(cmd)
creation_bytes = "".join(hashes).encode("utf8")
return hashlib.md5(creation_bytes).hexdigest()

View File

@ -4,8 +4,11 @@ from wasabi import msg
import sys
import srsly
from ... import about
from ...git_info import GIT_VERSION
from ...util import working_dir, run_command, split_command, is_cwd, join_command
from ...util import SimpleFrozenList
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
from ...util import check_bool_env_var
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
@ -62,12 +65,13 @@ def project_run(
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, err_help, **err_kwargs)
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
with working_dir(project_dir) as current_dir:
rerun = check_rerun(current_dir, cmd)
msg.divider(subcommand)
rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
if not rerun and not force:
msg.info(f"Skipping '{cmd['name']}': nothing changed")
else:
msg.divider(subcommand)
run_commands(cmd["script"], dry=dry)
if not dry:
update_lockfile(current_dir, cmd)
@ -171,12 +175,19 @@ def validate_subcommand(
)
def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
def check_rerun(
project_dir: Path,
command: Dict[str, Any],
*,
check_spacy_version: bool = True,
check_spacy_commit: bool = False,
) -> bool:
"""Check if a command should be rerun because its settings or inputs/outputs
changed.
project_dir (Path): The current project directory.
command (Dict[str, Any]): The command, as defined in the project.yml.
check_spacy_version (bool): Whether to check the spaCy minor version.
check_spacy_commit (bool): Whether to check the spaCy commit hash.
RETURNS (bool): Whether to re-run the command.
"""
lock_path = project_dir / PROJECT_LOCK
@ -189,10 +200,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
# Always run commands with no outputs (otherwise they'd always be skipped)
if not entry.get("outs", []):
return True
# Always rerun if spaCy version or commit hash changed
spacy_v = entry.get("spacy_version")
commit = entry.get("spacy_git_version")
if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
return True
if check_spacy_commit and commit != GIT_VERSION:
info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
return True
# If the entry in the lockfile matches the lockfile entry that would be
# generated from the current command, we don't rerun because it means that
# all inputs/outputs, hashes and scripts are the same and nothing changed
return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)
lock_entry = get_lock_entry(project_dir, command)
exclude = ["spacy_version", "spacy_git_version"]
return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
@ -231,6 +255,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
"script": command["script"],
"deps": deps,
"outs": [*outs, *outs_nc],
"spacy_version": about.__version__,
"spacy_git_version": GIT_VERSION,
}
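
Note: the lockfile entry now records which spaCy version and commit produced it, while `check_rerun` hashes the entries with those two keys excluded, so only the explicit version/commit comparisons above can trigger a version-based rerun. A rough sketch, with placeholder values and with the other lock-entry fields omitted:

```python
# Hypothetical project.lock entry written by update_lockfile() after this change
# (only the fields visible in the hunk above are shown):
lock_entry = {
    "script": ["python scripts/train.py"],
    "deps": [],
    "outs": [],
    "spacy_version": "3.0.0a34",
    "spacy_git_version": "e8156d191f",  # placeholder commit hash
}

# Commit-level rerun checks are opt-in via the environment variable added to
# ENV_VARS further down in this diff, e.g.:
#   SPACY_PROJECT_USE_GIT_VERSION=1 python -m spacy project run train
```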

View File

@ -171,9 +171,14 @@ factory = "tok2vec"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode.width}
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
also_embed_subwords = {{ "true" if has_letters else "false" }}
also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
{% if has_letters -%}
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
rows = [5000, 2500, 2500, 2500]
{% else -%}
attrs = ["ORTH", "SHAPE"]
rows = [5000, 2500]
{% endif -%}
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"

View File

@ -456,10 +456,14 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master
E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
E901 = ("Failed to remove existing output directory: {path}. If your "
"config and the components you train change between runs, a "
"non-empty output directory can lead to stale pipeline data. To "
"solve this, remove the existing directories in the output directory.")
E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
"Try checking whitespace and delimiters. See "
"https://nightly.spacy.io/api/cli#convert")
E093 = ("The token-per-line NER file is not formatted correctly. Try checking "
E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
"whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
"dimension refers to the output width, after the linear projection "

View File

@ -25,8 +25,14 @@ class Russian(Language):
default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
overwrite: bool = False,
):
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Russian"]

View File

@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Tuple
from thinc.api import Model
from ...lookups import Lookups
from ...pipeline import Lemmatizer
from ...symbols import POS
from ...tokens import Token
@ -22,9 +21,9 @@ class RussianLemmatizer(Lemmatizer):
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
lookups: Optional[Lookups] = None,
overwrite: bool = False,
) -> None:
super().__init__(vocab, model, name, mode=mode, lookups=lookups)
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
try:
from pymorphy2 import MorphAnalyzer

View File

@ -26,8 +26,10 @@ class Ukrainian(Language):
default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
def make_lemmatizer(
nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
):
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
__all__ = ["Ukrainian"]

View File

@ -3,7 +3,6 @@ from typing import Optional
from thinc.api import Model
from ..ru.lemmatizer import RussianLemmatizer
from ...lookups import Lookups
from ...vocab import Vocab
@ -15,9 +14,9 @@ class UkrainianLemmatizer(RussianLemmatizer):
name: str = "lemmatizer",
*,
mode: str = "pymorphy2",
lookups: Optional[Lookups] = None,
overwrite: bool = False,
) -> None:
super().__init__(vocab, model, name, mode=mode, lookups=lookups)
super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
try:
from pymorphy2 import MorphAnalyzer
except ImportError:

View File

@ -17,8 +17,7 @@ from ... import util
# fmt: off
_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
# fmt: on
DEFAULT_CONFIG = """
@ -55,9 +54,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
class ChineseTokenizer(DummyTokenizer):
def __init__(
self, nlp: Language, segmenter: Segmenter = Segmenter.char,
):
def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
self.vocab = nlp.vocab
if isinstance(segmenter, Segmenter):
segmenter = segmenter.value
@ -82,11 +79,13 @@ class ChineseTokenizer(DummyTokenizer):
*,
nlp: Optional[Language] = None,
pkuseg_model: Optional[str] = None,
pkuseg_user_dict: str = "default",
pkuseg_user_dict: Optional[str] = "default",
):
if self.segmenter == Segmenter.pkuseg:
if pkuseg_user_dict is None:
pkuseg_user_dict = pkuseg_model
self.pkuseg_seg = try_pkuseg_import(
pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict
)
def __call__(self, text: str) -> Doc:
@ -120,12 +119,12 @@ class ChineseTokenizer(DummyTokenizer):
if self.segmenter == Segmenter.pkuseg:
if reset:
try:
import pkuseg
import spacy_pkuseg
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
except ImportError:
msg = (
"pkuseg not installed: unable to reset pkuseg "
"spacy_pkuseg not installed: unable to reset pkuseg "
"user dict. Please " + _PKUSEG_INSTALL_MSG
)
raise ImportError(msg) from None
@ -156,23 +155,7 @@ class ChineseTokenizer(DummyTokenizer):
self.pkuseg_seg.feature_extractor.save(tempdir)
self.pkuseg_seg.model.save(tempdir)
tempdir = Path(tempdir)
# pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
# means that it will be saved with pickle protocol 5 with
# python 3.8, which can't be reloaded with python 3.6-3.7.
# To try to make the model compatible with python 3.6+, reload
# the data with pickle5 and convert it back to protocol 4.
try:
import pickle5
with open(tempdir / "features.pkl", "rb") as fileh:
features = pickle5.load(fileh)
with open(tempdir / "features.pkl", "wb") as fileh:
pickle5.dump(features, fileh, protocol=4)
except ImportError as e:
raise e
except Exception:
warnings.warn(_PKUSEG_PICKLE_WARNING)
with open(tempdir / "features.pkl", "rb") as fileh:
with open(tempdir / "features.msgpack", "rb") as fileh:
pkuseg_features_b = fileh.read()
with open(tempdir / "weights.npz", "rb") as fileh:
pkuseg_weights_b = fileh.read()
@ -213,22 +196,22 @@ class ChineseTokenizer(DummyTokenizer):
if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
with tempfile.TemporaryDirectory() as tempdir:
tempdir = Path(tempdir)
with open(tempdir / "features.pkl", "wb") as fileh:
with open(tempdir / "features.msgpack", "wb") as fileh:
fileh.write(pkuseg_data["features_b"])
with open(tempdir / "weights.npz", "wb") as fileh:
fileh.write(pkuseg_data["weights_b"])
try:
import pkuseg
import spacy_pkuseg
except ImportError:
raise ImportError(
"pkuseg not installed. To use this model, "
"spacy-pkuseg not installed. To use this model, "
+ _PKUSEG_INSTALL_MSG
) from None
self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
if pkuseg_data["processors_data"]:
processors_data = pkuseg_data["processors_data"]
(user_dict, do_process, common_words, other_words) = processors_data
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words)
self.pkuseg_seg.postprocesser.other_words = set(other_words)
@ -244,18 +227,6 @@ class ChineseTokenizer(DummyTokenizer):
path.mkdir(parents=True)
self.pkuseg_seg.model.save(path)
self.pkuseg_seg.feature_extractor.save(path)
# try to convert features.pkl to pickle protocol 4
try:
import pickle5
with open(path / "features.pkl", "rb") as fileh:
features = pickle5.load(fileh)
with open(path / "features.pkl", "wb") as fileh:
pickle5.dump(features, fileh, protocol=4)
except ImportError as e:
raise e
except Exception:
warnings.warn(_PKUSEG_PICKLE_WARNING)
def save_pkuseg_processors(path):
if self.pkuseg_seg:
@ -279,26 +250,26 @@ class ChineseTokenizer(DummyTokenizer):
def load_pkuseg_model(path):
try:
import pkuseg
import spacy_pkuseg
except ImportError:
if self.segmenter == Segmenter.pkuseg:
raise ImportError(
"pkuseg not installed. To use this model, "
"spacy-pkuseg not installed. To use this model, "
+ _PKUSEG_INSTALL_MSG
) from None
if path.exists():
self.pkuseg_seg = pkuseg.pkuseg(path)
self.pkuseg_seg = spacy_pkuseg.pkuseg(path)
def load_pkuseg_processors(path):
try:
import pkuseg
import spacy_pkuseg
except ImportError:
if self.segmenter == Segmenter.pkuseg:
raise ImportError(self._pkuseg_install_msg) from None
if self.segmenter == Segmenter.pkuseg:
data = srsly.read_msgpack(path)
(user_dict, do_process, common_words, other_words) = data
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words)
self.pkuseg_seg.postprocesser.other_words = set(other_words)
@ -341,12 +312,13 @@ def try_jieba_import() -> None:
def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
try:
import pkuseg
import spacy_pkuseg
return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
except ImportError:
msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
msg = "spacy-pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
raise ImportError(msg) from None
try:
return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
except FileNotFoundError:
msg = "Unable to load pkuseg model from: " + pkuseg_model
raise FileNotFoundError(msg) from None
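
A usage sketch under the new package name, assuming `spacy-pkuseg` is installed and that the method shown in the hunk above (the one taking `pkuseg_model`/`pkuseg_user_dict`) is exposed as the tokenizer's `initialize` hook, as in the final v3 API; the `"web"` model name mirrors the updated test fixture later in this diff:

```python
from spacy.lang.zh import Chinese

# Default: character segmentation, no extra dependencies.
nlp = Chinese()

# pkuseg segmentation now requires the spacy-pkuseg package:
#   pip install spacy-pkuseg==0.0.26
cfg = {"segmenter": "pkuseg"}
nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
# Load one of the pretrained spacy-pkuseg models, e.g. "web".
nlp.tokenizer.initialize(pkuseg_model="web")
```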

View File

@ -289,13 +289,12 @@ class Lookups:
DOCS: https://nightly.spacy.io/api/lookups#to_disk
"""
if len(self._tables):
path = ensure_path(path)
if not path.exists():
path.mkdir()
filepath = path / filename
with filepath.open("wb") as file_:
file_.write(self.to_bytes())
path = ensure_path(path)
if not path.exists():
path.mkdir()
filepath = path / filename
with filepath.open("wb") as file_:
file_.write(self.to_bytes())
def from_disk(
self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs

View File

@ -11,7 +11,7 @@ from ...ml import _character_embed
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
from ...attrs import intify_attr
@registry.architectures.register("spacy.Tok2VecListener.v1")
@ -29,7 +29,7 @@ def build_hash_embed_cnn_tok2vec(
window_size: int,
maxout_pieces: int,
subword_features: bool,
pretrained_vectors: Optional[bool]
pretrained_vectors: Optional[bool],
) -> Model[List[Doc], List[Floats2d]]:
"""Build spaCy's 'standard' tok2vec layer, which uses hash embedding
with subword features and a CNN with layer-normalized maxout.
@ -54,12 +54,18 @@ def build_hash_embed_cnn_tok2vec(
a language such as Chinese.
pretrained_vectors (bool): Whether to also use static vectors.
"""
if subword_features:
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
row_sizes = [embed_size, embed_size // 2, embed_size // 2, embed_size // 2]
else:
attrs = ["NORM"]
row_sizes = [embed_size]
return build_Tok2Vec_model(
embed=MultiHashEmbed(
width=width,
rows=embed_size,
also_embed_subwords=subword_features,
also_use_static_vectors=bool(pretrained_vectors),
rows=row_sizes,
attrs=attrs,
include_static_vectors=bool(pretrained_vectors),
),
encode=MaxoutWindowEncoder(
width=width,
@ -93,58 +99,59 @@ def build_Tok2Vec_model(
@registry.architectures.register("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(
width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
width: int,
attrs: List[Union[str, int]],
rows: List[int],
include_static_vectors: bool,
) -> Model[List[Doc], List[Floats2d]]:
"""Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it
through a feed-forward subnetwork to build a mixed representation.
The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
varying definitions depending on the Vocab of the Doc object passed in.
Vectors from pretrained static vectors can also be incorporated into the
concatenated representation.
The features used can be configured with the 'attrs' argument. The suggested
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
account some subword information, without constructing a fully character-based
representation. If pretrained vectors are available, they can be included in
the representation as well, but the vectors table will be kept static
(i.e. it's not updated).
The `width` parameter specifies the output width of the layer and the widths
of all embedding tables. If static vectors are included, a learned linear
layer is used to map the vectors to the specified width before concatenating
it with the other embedding outputs. A single Maxout layer is then used to
reduce the concatenated vectors to the final width.
The `rows` parameter controls the number of rows used by the `HashEmbed`
tables. The HashEmbed layer needs surprisingly few rows, due to its use of
the hashing trick. Generally between 2000 and 10000 rows is sufficient,
even for very large vocabularies. A number of rows must be specified for each
table, so the `rows` list must be of the same length as the `attrs` parameter.
width (int): The output width. Also used as the width of the embedding tables.
Recommended values are between 64 and 300.
rows (int): The number of rows for the embedding tables. Can be low, due
to the hashing trick. Embeddings for prefix, suffix and word shape
use half as many rows. Recommended values are between 2000 and 10000.
also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
features in the embeddings. If not using these, you may need more
rows in your hash embeddings, as there will be increased chance of
collisions.
also_use_static_vectors (bool): Whether to also use static word vectors.
attrs (list of attr IDs): The token attributes to embed. A separate
embedding table will be constructed for each attribute.
rows (List[int]): The number of rows in the embedding tables. Must have the
same length as attrs.
include_static_vectors (bool): Whether to also use static word vectors.
Requires a vectors table to be loaded in the Doc objects' vocab.
"""
cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
if len(rows) != len(attrs):
raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
seed = 7
def make_hash_embed(feature):
def make_hash_embed(index):
nonlocal seed
seed += 1
return HashEmbed(
width,
rows if feature == LOWER else rows // 2,
column=cols.index(feature),
seed=seed,
dropout=0.0,
)
return HashEmbed(width, rows[index], column=index, seed=seed, dropout=0.0)
if also_embed_subwords:
embeddings = [
make_hash_embed(LOWER),
make_hash_embed(PREFIX),
make_hash_embed(SUFFIX),
make_hash_embed(SHAPE),
]
else:
embeddings = [make_hash_embed(LOWER)]
concat_size = width * (len(embeddings) + also_use_static_vectors)
if also_use_static_vectors:
embeddings = [make_hash_embed(i) for i in range(len(attrs))]
concat_size = width * (len(embeddings) + include_static_vectors)
if include_static_vectors:
model = chain(
concatenate(
chain(
FeatureExtractor(cols),
FeatureExtractor(attrs),
list2ragged(),
with_array(concatenate(*embeddings)),
),
@ -155,7 +162,7 @@ def MultiHashEmbed(
)
else:
model = chain(
FeatureExtractor(cols),
FeatureExtractor(list(attrs)),
list2ragged(),
with_array(concatenate(*embeddings)),
with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
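
For comparison with the old `also_embed_subwords`/`also_use_static_vectors` flags, a sketch of the new call signature, following the same pattern the updated tests later in this diff use; the row counts here are just the values suggested by the quickstart template:

```python
from spacy.ml.models.tok2vec import MultiHashEmbed, MaxoutWindowEncoder, build_Tok2Vec_model

# One embedding table per attribute; rows must line up with attrs.
embed = MultiHashEmbed(
    width=96,
    attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
    rows=[5000, 2500, 2500, 2500],
    include_static_vectors=False,  # True requires a loaded vectors table
)
encode = MaxoutWindowEncoder(width=96, depth=4, window_size=1, maxout_pieces=3)
tok2vec = build_Tok2Vec_model(embed, encode)
```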

View File

@ -210,7 +210,7 @@ class Morphologizer(Tagger):
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
"""

View File

@ -162,7 +162,7 @@ cdef class Pipe:
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://nightly.spacy.io/api/pipe#get_loss
"""

View File

@ -104,7 +104,7 @@ class SentenceRecognizer(Tagger):
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
"""

View File

@ -249,7 +249,7 @@ class Tagger(Pipe):
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://nightly.spacy.io/api/tagger#get_loss
"""

View File

@ -281,7 +281,7 @@ class TextCategorizer(Pipe):
examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
RETUTNRS (Tuple[float, float]): The loss and the gradient.
RETURNS (Tuple[float, float]): The loss and the gradient.
DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
"""

View File

@ -448,6 +448,7 @@ class ProjectConfigSchema(BaseModel):
workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
commands: List[ProjectConfigCommand] = Field([], title="Project command shortcuts")
title: Optional[str] = Field(None, title="Project title")
spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
# fmt: on
class Config:
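
The new field corresponds to a top-level `spacy_version` entry in `project.yml`; expressed as the parsed dict this schema validates (all values here are hypothetical):

```python
# Hypothetical parsed project.yml illustrating the new optional field; the
# range is checked against about.__version__ by validate_project_version().
project_config = {
    "title": "Demo project",
    "spacy_version": ">=3.0.0a34,<3.1.0",
    "commands": [{"name": "train", "script": ["python scripts/train.py"]}],
    "workflows": {"all": ["train"]},
}
```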

View File

@ -248,7 +248,6 @@ def tt_tokenizer():
@pytest.fixture(scope="session")
def uk_tokenizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy2.lang")
return get_lang_class("uk")().tokenizer
@ -285,8 +284,7 @@ def zh_tokenizer_jieba():
@pytest.fixture(scope="session")
def zh_tokenizer_pkuseg():
pytest.importorskip("pkuseg")
pytest.importorskip("pickle5")
pytest.importorskip("spacy_pkuseg")
config = {
"nlp": {
"tokenizer": {
@ -296,7 +294,7 @@ def zh_tokenizer_pkuseg():
},
"initialize": {
"tokenizer": {
"pkuseg_model": "default",
"pkuseg_model": "web",
}
},
}

View File

@ -209,9 +209,13 @@ def test_doc_retokenizer_split_norm(en_vocab):
# Retokenize to split out the words in the token at doc[2].
token = doc[2]
with doc.retokenize() as retokenizer:
retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
retokenizer.split(
token,
["brown", "fox", "jumps", "over", "the"],
heads=[(token, idx) for idx in range(5)],
)
assert doc[9].text == "w/"
assert doc[9].text == "w/"
assert doc[9].norm_ == "with"
assert doc[5].text == "over"
assert doc[5].text == "over"
assert doc[5].norm_ == "over"

View File

@ -350,7 +350,7 @@ def test_pipe_methods_frozen():
@pytest.mark.parametrize(
"pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"],
"pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"]
)
def test_pipe_label_data_exports_labels(pipe):
nlp = Language()

View File

@ -24,9 +24,9 @@ def test_empty_doc():
tok2vec = build_Tok2Vec_model(
MultiHashEmbed(
width=width,
rows=embed_size,
also_use_static_vectors=False,
also_embed_subwords=True,
rows=[embed_size, embed_size, embed_size, embed_size],
include_static_vectors=False,
attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
),
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
)
@ -44,9 +44,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
tok2vec = build_Tok2Vec_model(
MultiHashEmbed(
width=width,
rows=embed_size,
also_use_static_vectors=False,
also_embed_subwords=True,
rows=[embed_size] * 4,
include_static_vectors=False,
attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
),
MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
)
@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
@pytest.mark.parametrize(
"width,embed_arch,embed_config,encode_arch,encode_config",
[
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
(8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
(8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
(8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
(8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
(8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
],
@ -118,9 +118,9 @@ cfg_string = """
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode.width}
rows = 2000
also_embed_subwords = true
also_use_static_vectors = false
rows = [2000, 1000, 1000, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"

View File

@ -1,6 +1,5 @@
from spacy.lang.en import English
from spacy.pipeline import merge_entities
import pytest
def test_issue5918():
@ -23,7 +22,8 @@ def test_issue5918():
assert len(doc.ents) == 3
# make it so that the third span's head is within the entity (ent_iob=I)
# bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents.
with pytest.warns(UserWarning):
doc[29].head = doc[33]
# TODO: test for logging here
# with pytest.warns(UserWarning):
# doc[29].head = doc[33]
doc = merge_entities(doc)
assert len(doc.ents) == 3

View File

@ -89,9 +89,9 @@ def my_parser():
tok2vec = build_Tok2Vec_model(
MultiHashEmbed(
width=321,
rows=5432,
also_embed_subwords=True,
also_use_static_vectors=False,
attrs=["LOWER", "SHAPE"],
rows=[5432, 5432],
include_static_vectors=False,
),
MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
)

View File

@ -7,6 +7,15 @@ from spacy import util
from spacy import prefer_gpu, require_gpu
from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from spacy.util import dot_to_object, SimpleFrozenList
from thinc.api import Config, Optimizer, ConfigValidationError
from spacy.training.batchers import minibatch_by_words
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import DEFAULT_CONFIG_PATH
from spacy.schemas import ConfigSchemaTraining
from .util import get_random_doc
@pytest.fixture
@ -140,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected):
assert util.is_unconstrained_version(constraint) is expected
@pytest.mark.parametrize(
"a1,a2,b1,b2,is_match",
[
("3.0.0", "3.0", "3.0.1", "3.0", True),
("3.1.0", "3.1", "3.2.1", "3.2", False),
("xxx", None, "1.2.3.dev0", "1.2", False),
],
)
def test_minor_version(a1, a2, b1, b2, is_match):
assert util.get_minor_version(a1) == a2
assert util.get_minor_version(b1) == b2
assert util.is_minor_version_match(a1, b1) is is_match
assert util.is_minor_version_match(a2, b2) is is_match
@pytest.mark.parametrize(
"dot_notation,expected",
[
@ -157,3 +181,128 @@ def test_dot_to_dict(dot_notation, expected):
result = util.dot_to_dict(dot_notation)
assert result == expected
assert util.dict_to_dot(result) == dot_notation
@pytest.mark.parametrize(
"doc_sizes, expected_batches",
[
([400, 400, 199], [3]),
([400, 400, 199, 3], [4]),
([400, 400, 199, 3, 200], [3, 2]),
([400, 400, 199, 3, 1], [5]),
([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
([400, 400, 199, 3, 1, 200], [3, 3]),
([400, 400, 199, 3, 1, 999], [3, 3]),
([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
([1, 2, 999], [3]),
([1, 2, 999, 1], [4]),
([1, 200, 999, 1], [2, 2]),
([1, 999, 200, 1], [2, 2]),
],
)
def test_util_minibatch(doc_sizes, expected_batches):
docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
tol = 0.2
batch_size = 1000
batches = list(
minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
)
assert [len(batch) for batch in batches] == expected_batches
max_size = batch_size + batch_size * tol
for batch in batches:
assert sum([len(doc) for doc in batch]) < max_size
@pytest.mark.parametrize(
"doc_sizes, expected_batches",
[
([400, 4000, 199], [1, 2]),
([400, 400, 199, 3000, 200], [1, 4]),
([400, 400, 199, 3, 1, 1500], [1, 5]),
([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
([1, 2, 9999], [1, 2]),
([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
],
)
def test_util_minibatch_oversize(doc_sizes, expected_batches):
""" Test that oversized documents are returned in their own batch"""
docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
tol = 0.2
batch_size = 1000
batches = list(
minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
)
assert [len(batch) for batch in batches] == expected_batches
def test_util_dot_section():
cfg_string = """
[nlp]
lang = "en"
pipeline = ["textcat"]
[components]
[components.textcat]
factory = "textcat"
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
"""
nlp_config = Config().from_str(cfg_string)
en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
default_config["nlp"]["lang"] = "nl"
nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
# Test that creation went OK
assert isinstance(en_nlp, English)
assert isinstance(nl_nlp, Dutch)
assert nl_nlp.pipe_names == []
assert en_nlp.pipe_names == ["textcat"]
# not exclusive_classes
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
# Test that default values got overwritten
assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
assert nl_nlp.config["nlp"]["pipeline"] == [] # default value []
# Test proper functioning of 'dot_to_object'
with pytest.raises(KeyError):
dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
with pytest.raises(KeyError):
dot_to_object(en_nlp.config, "nlp.unknownattribute")
T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
def test_simple_frozen_list():
t = SimpleFrozenList(["foo", "bar"])
assert t == ["foo", "bar"]
assert t.index("bar") == 1 # okay method
with pytest.raises(NotImplementedError):
t.append("baz")
with pytest.raises(NotImplementedError):
t.sort()
with pytest.raises(NotImplementedError):
t.extend(["baz"])
with pytest.raises(NotImplementedError):
t.pop()
t = SimpleFrozenList(["foo", "bar"], error="Error!")
with pytest.raises(NotImplementedError):
t.append("baz")
def test_resolve_dot_names():
config = {
"training": {"optimizer": {"@optimizers": "Adam.v1"}},
"foo": {"bar": "training.optimizer", "baz": "training.xyz"},
}
result = util.resolve_dot_names(config, ["training.optimizer"])
assert isinstance(result[0], Optimizer)
with pytest.raises(ConfigValidationError) as e:
util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
errors = e.value.errors
assert len(errors) == 1
assert errors[0]["loc"] == ["training", "xyz"]

View File

@ -61,7 +61,10 @@ def get_tok2vec_kwargs():
# This actually creates models, so seems best to put it in a function.
return {
"embed": MultiHashEmbed(
width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
width=32,
rows=[500, 500, 500],
attrs=["NORM", "PREFIX", "SHAPE"],
include_static_vectors=False,
),
"encode": MaxoutWindowEncoder(
width=32, depth=2, maxout_pieces=2, window_size=1
@ -73,6 +76,32 @@ def test_tok2vec():
return build_Tok2Vec_model(**get_tok2vec_kwargs())
def test_multi_hash_embed():
embed = MultiHashEmbed(
width=32,
rows=[500, 500, 500],
attrs=["NORM", "PREFIX", "SHAPE"],
include_static_vectors=False,
)
hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
assert len(hash_embeds) == 3
# Check they look at different columns.
assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2]
# Check they use different seeds
assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3
# Check they all have the same number of rows
assert [he.get_dim("nV") for he in hash_embeds] == [500, 500, 500]
# Now try with different row factors
embed = MultiHashEmbed(
width=32,
rows=[1000, 50, 250],
attrs=["NORM", "PREFIX", "SHAPE"],
include_static_vectors=False,
)
hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250]
@pytest.mark.parametrize(
"seed,model_func,kwargs",
[

View File

@ -1,137 +0,0 @@
import pytest
from spacy import util
from spacy.util import dot_to_object, SimpleFrozenList
from thinc.api import Config, Optimizer, ConfigValidationError
from spacy.training.batchers import minibatch_by_words
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import DEFAULT_CONFIG_PATH
from spacy.schemas import ConfigSchemaTraining
from .util import get_random_doc
@pytest.mark.parametrize(
"doc_sizes, expected_batches",
[
([400, 400, 199], [3]),
([400, 400, 199, 3], [4]),
([400, 400, 199, 3, 200], [3, 2]),
([400, 400, 199, 3, 1], [5]),
([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
([400, 400, 199, 3, 1, 200], [3, 3]),
([400, 400, 199, 3, 1, 999], [3, 3]),
([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
([1, 2, 999], [3]),
([1, 2, 999, 1], [4]),
([1, 200, 999, 1], [2, 2]),
([1, 999, 200, 1], [2, 2]),
],
)
def test_util_minibatch(doc_sizes, expected_batches):
docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
tol = 0.2
batch_size = 1000
batches = list(
minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
)
assert [len(batch) for batch in batches] == expected_batches
max_size = batch_size + batch_size * tol
for batch in batches:
assert sum([len(doc) for doc in batch]) < max_size
@pytest.mark.parametrize(
"doc_sizes, expected_batches",
[
([400, 4000, 199], [1, 2]),
([400, 400, 199, 3000, 200], [1, 4]),
([400, 400, 199, 3, 1, 1500], [1, 5]),
([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
([1, 2, 9999], [1, 2]),
([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
],
)
def test_util_minibatch_oversize(doc_sizes, expected_batches):
""" Test that oversized documents are returned in their own batch"""
docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
tol = 0.2
batch_size = 1000
batches = list(
minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
)
assert [len(batch) for batch in batches] == expected_batches
def test_util_dot_section():
cfg_string = """
[nlp]
lang = "en"
pipeline = ["textcat"]
[components]
[components.textcat]
factory = "textcat"
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
"""
nlp_config = Config().from_str(cfg_string)
en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
default_config["nlp"]["lang"] = "nl"
nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
# Test that creation went OK
assert isinstance(en_nlp, English)
assert isinstance(nl_nlp, Dutch)
assert nl_nlp.pipe_names == []
assert en_nlp.pipe_names == ["textcat"]
# not exclusive_classes
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
# Test that default values got overwritten
assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
assert nl_nlp.config["nlp"]["pipeline"] == [] # default value []
# Test proper functioning of 'dot_to_object'
with pytest.raises(KeyError):
dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
with pytest.raises(KeyError):
dot_to_object(en_nlp.config, "nlp.unknownattribute")
T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)
def test_simple_frozen_list():
t = SimpleFrozenList(["foo", "bar"])
assert t == ["foo", "bar"]
assert t.index("bar") == 1 # okay method
with pytest.raises(NotImplementedError):
t.append("baz")
with pytest.raises(NotImplementedError):
t.sort()
with pytest.raises(NotImplementedError):
t.extend(["baz"])
with pytest.raises(NotImplementedError):
t.pop()
t = SimpleFrozenList(["foo", "bar"], error="Error!")
with pytest.raises(NotImplementedError):
t.append("baz")
def test_resolve_dot_names():
config = {
"training": {"optimizer": {"@optimizers": "Adam.v1"}},
"foo": {"bar": "training.optimizer", "baz": "training.xyz"},
}
result = util.resolve_dot_names(config, ["training.optimizer"])
assert isinstance(result[0], Optimizer)
with pytest.raises(ConfigValidationError) as e:
util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
errors = e.value.errors
assert len(errors) == 1
assert errors[0]["loc"] == ["training", "xyz"]

View File

@ -1528,7 +1528,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
while not heads_within_sents:
heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count)
if loop_count > 10:
warnings.warn(Warnings.W026)
util.logger.debug(Warnings.W026)
break
loop_count += 1
# Set sentence starts

View File

@ -5,7 +5,7 @@ import copy
from functools import partial
from pydantic import BaseModel, StrictStr
from ..util import registry, logger
from ..util import registry
from ..tokens import Doc
from .example import Example
@ -64,7 +64,7 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:
def lower_casing_augmenter(
nlp: "Language", example: Example, *, level: float,
nlp: "Language", example: Example, *, level: float
) -> Iterator[Example]:
if random.random() >= level:
yield example
@ -119,9 +119,8 @@ def make_orth_variants(
orig_token_dict = copy.deepcopy(token_dict)
ndsv = orth_variants.get("single", [])
ndpv = orth_variants.get("paired", [])
logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
words = token_dict.get("words", [])
tags = token_dict.get("tags", [])
words = token_dict.get("ORTH", [])
tags = token_dict.get("TAG", [])
# keep unmodified if words or tags are not defined
if words and tags:
if lower:
@ -154,8 +153,8 @@ def make_orth_variants(
if words[word_idx] in pair:
pair_idx = pair.index(words[word_idx])
words[word_idx] = punct_choices[punct_idx][pair_idx]
token_dict["words"] = words
token_dict["tags"] = tags
token_dict["ORTH"] = words
token_dict["TAG"] = tags
# modify raw
if raw is not None:
variants = []

View File

@ -103,7 +103,7 @@ def conll_ner_to_docs(
lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
cols = list(zip(*[line.split() for line in lines]))
if len(cols) < 2:
raise ValueError(Errors.E093)
raise ValueError(Errors.E903)
length = len(cols[0])
words.extend(cols[0])
sent_starts.extend([True] + [False] * (length - 1))

View File

@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
sent_words, sent_iob = zip(*sent_tokens)
sent_tags = ["-"] * len(sent_words)
else:
raise ValueError(Errors.E092)
raise ValueError(Errors.E902)
words.extend(sent_words)
tags.extend(sent_tags)
iob.extend(sent_iob)

View File

@ -3,19 +3,24 @@ from typing import Optional, TYPE_CHECKING
from pathlib import Path
from timeit import default_timer as timer
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
from wasabi import Printer
import random
import wasabi
import sys
import shutil
from .example import Example
from ..schemas import ConfigSchemaTraining
from ..errors import Errors
from ..util import resolve_dot_names, registry
from ..util import resolve_dot_names, registry, logger
if TYPE_CHECKING:
from ..language import Language # noqa: F401
DIR_MODEL_BEST = "model-best"
DIR_MODEL_LAST = "model-last"
def train(
nlp: "Language",
output_path: Optional[Path] = None,
@ -38,7 +43,7 @@ def train(
RETURNS (Path / None): The path to the final exported model.
"""
# We use no_print here so we can respect the stdout/stderr options.
msg = wasabi.Printer(no_print=True)
msg = Printer(no_print=True)
# Create iterator, which yields out info after each optimization step.
config = nlp.config.interpolate()
if config["training"]["seed"] is not None:
@ -69,6 +74,7 @@ def train(
eval_frequency=T["eval_frequency"],
exclude=frozen_components,
)
clean_output_dir(output_path)
stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
if frozen_components:
stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
@ -83,7 +89,7 @@ def train(
update_meta(T, nlp, info)
with nlp.use_params(optimizer.averages):
nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-best")
nlp.to_disk(output_path / DIR_MODEL_BEST)
except Exception as e:
if output_path is not None:
# We don't want to swallow the traceback if we don't have a
@ -100,7 +106,7 @@ def train(
finally:
finalize_logger()
if output_path is not None:
final_model_path = output_path / "model-last"
final_model_path = output_path / DIR_MODEL_LAST
if optimizer.averages:
with nlp.use_params(optimizer.averages):
nlp.to_disk(final_model_path)
@ -305,3 +311,19 @@ def create_before_to_disk_callback(
return modified_nlp
return before_to_disk
def clean_output_dir(path: Union[str, Path]) -> None:
"""Remove an existing output directory. Typically used to ensure that that
a directory like model-best and its contents aren't just being overwritten
by nlp.to_disk, which could preserve existing subdirectories (e.g.
components that don't exist anymore).
"""
if path is not None and path.exists():
for subdir in [path / DIR_MODEL_BEST, path / DIR_MODEL_LAST]:
if subdir.exists():
try:
shutil.rmtree(str(subdir))
logger.debug(f"Removed existing output directory: {subdir}")
except Exception as e:
raise IOError(Errors.E901.format(path=path)) from e
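
To make the effect concrete, a small sketch of the cleanup on a hypothetical output directory (it assumes `clean_output_dir` from this module is in scope; only the two model subdirectories are removed, anything else is left in place):

```python
from pathlib import Path

out = Path("output")
for sub in ("model-best", "model-last"):
    (out / sub).mkdir(parents=True, exist_ok=True)
(out / "metrics.json").write_text("{}")

clean_output_dir(out)
assert not (out / "model-best").exists()
assert not (out / "model-last").exists()
assert (out / "metrics.json").exists()  # unrelated files are untouched
```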

View File

@ -73,6 +73,7 @@ logger = logging.getLogger("spacy")
class ENV_VARS:
CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"
class registry(thinc.registry):
@ -584,6 +585,33 @@ def get_base_version(version: str) -> str:
return Version(version).base_version
def get_minor_version(version: str) -> Optional[str]:
"""Get the major + minor version (without patch or prerelease identifiers).
version (str): The version.
RETURNS (str): The major + minor version or None if version is invalid.
"""
try:
v = Version(version)
except (TypeError, InvalidVersion):
return None
return f"{v.major}.{v.minor}"
def is_minor_version_match(version_a: str, version_b: str) -> bool:
"""Compare two versions and check if they match in major and minor, without
patch or prerelease identifiers. Used internally for compatibility checks
that should be insensitive to patch releases.
version_a (str): The first version
version_b (str): The second version.
RETURNS (bool): Whether the versions match.
"""
a = get_minor_version(version_a)
b = get_minor_version(version_b)
return a is not None and b is not None and a == b
def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
"""Load a model meta.json from a path and validate its contents.
@ -1315,3 +1343,16 @@ def is_cython_func(func: Callable) -> bool:
cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
return hasattr(cls_func, attr)
return False
def check_bool_env_var(env_var: str) -> bool:
"""Convert the value of an environment variable to a boolean. Add special
check for "0" (falsy) and consider everything else truthy, except unset.
env_var (str): The name of the environment variable to check.
RETURNS (bool): Its boolean value.
"""
value = os.environ.get(env_var, False)
if value == "0":
return False
return bool(value)
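
To pin down the semantics, a short usage sketch of the three helpers added above; the version values mirror the new `test_minor_version` cases elsewhere in this diff:

```python
import os
from spacy import util

assert util.get_minor_version("3.0.0") == "3.0"
assert util.get_minor_version("1.2.3.dev0") == "1.2"
assert util.get_minor_version("xxx") is None

assert util.is_minor_version_match("3.0.0", "3.0.1")      # patch releases match
assert not util.is_minor_version_match("3.1.0", "3.2.1")  # minor bump does not

# check_bool_env_var: only "0" (and an unset variable) count as False.
os.environ[util.ENV_VARS.PROJECT_USE_GIT_VERSION] = "0"
assert util.check_bool_env_var(util.ENV_VARS.PROJECT_USE_GIT_VERSION) is False
os.environ[util.ENV_VARS.PROJECT_USE_GIT_VERSION] = "true"
assert util.check_bool_env_var(util.ENV_VARS.PROJECT_USE_GIT_VERSION) is True
```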

View File

@ -445,9 +445,9 @@ cdef class Vocab:
setters = ["strings", "vectors"]
if "strings" not in exclude:
self.strings.to_disk(path / "strings.json")
if "vectors" not in "exclude" and self.vectors is not None:
if "vectors" not in "exclude":
self.vectors.to_disk(path)
if "lookups" not in "exclude" and self.lookups is not None:
if "lookups" not in "exclude":
self.lookups.to_disk(path)
def from_disk(self, path, *, exclude=tuple()):

View File

@ -136,25 +136,28 @@ argument that connects to the shared `tok2vec` component in the pipeline.
> [model]
> @architectures = "spacy.MultiHashEmbed.v1"
> width = 64
> rows = 2000
> also_embed_subwords = false
> also_use_static_vectors = false
> attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
> rows = [2000, 1000, 1000, 1000]
> include_static_vectors = true
> ```
Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build mixed representations. The features used are
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
vectors can also be incorporated into the concatenated representation.
a feed-forward subnetwork to build a mixed representation. The features used
can be configured with the `attrs` argument. The suggested attributes are
`NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account
some subword information, without constructing a fully character-based
representation. If pretrained vectors are available, they can be included in the
representation as well, but the vectors table will be kept static (i.e. it's
not updated).
| Name | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ |
| `rows` | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ |
| `also_embed_subwords` | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~ |
| `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
| Name | Description |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |
| `attrs` | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ |
| `rows` | The number of rows for each embedding table. The layer needs surprisingly few rows, due to its use of the hashing trick: generally between `2000` and `10000` rows is sufficient, even for very large vocabularies. A number of rows must be specified for each table, so the `rows` list must be of the same length as the `attrs` parameter. ~~List[int]~~ |
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.CharacterEmbed.v1 {#CharacterEmbed}

View File

@ -226,6 +226,12 @@ the "catastrophic forgetting" problem. This feature is experimental.
Find the loss and gradient of loss for the batch of documents and their
predicted scores.
<Infobox variant="danger">
This method needs to be overwritten with your own custom `get_loss` method.
</Infobox>
> #### Example
>
> ```python

View File

@ -86,7 +86,8 @@ see are:
| ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. |
| ~~Padded~~ | A container to handle variable-length sequence data in a padded contiguous array. |
The model type signatures help you figure out which model architectures and
See the [Thinc type reference](https://thinc.ai/docs/api-types) for details. The
model type signatures help you figure out which model architectures and
components can **fit together**. For instance, the
[`TextCategorizer`](/api/textcategorizer) class expects a model typed
~~Model[List[Doc], Floats2d]~~, because the model will predict one row of
@ -288,7 +289,7 @@ those parts of the network.
To use our custom model including the PyTorch subnetwork, all we need to do is
register the architecture using the
[`architectures` registry](/api/top-level#registry). This will assign the
[`architectures` registry](/api/top-level#registry). This assigns the
architecture a name so spaCy knows how to find it, and allows passing in
arguments like hyperparameters via the [config](/usage/training#config). The
full example then becomes:
@ -373,7 +374,7 @@ gpu_allocator = "pytorch"
Of course it's also possible to define the `Model` from the previous section
entirely in Thinc. The Thinc documentation provides details on the
[various layers](https://thinc.ai/docs/api-layers) and helper functions
available. Combinators can also be used to
available. Combinators can be used to
[overload operators](https://thinc.ai/docs/usage-models#operators) and a common
usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our
simple neural network would then become:
@ -486,28 +487,376 @@ with Model.define_operators({">>": chain}):
## Create new trainable components {#components}
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
In addition to [swapping out](#swap-architectures) default models in built-in
components, you can also implement an entirely new,
[trainable](/usage/processing-pipelines#trainable-components) pipeline component
from scratch. This can be done by creating a new class inheriting from
[`Pipe`](/api/pipe), and linking it up to your custom model implementation.
<Infobox title="Trainable component API" emoji="💡">
For details on how to implement pipeline components, check out the usage guide
on [custom components](/usage/processing-pipelines#custom-component) and the
overview of the `Pipe` methods used by
[trainable components](/usage/processing-pipelines#trainable-components).
</Infobox>
<!-- TODO: write trainable component section
- Interaction with `predict`, `get_loss` and `set_annotations`
- Initialization life-cycle with `initialize`, correlation with add_label
Example: relation extraction component (implemented as project template)
Avoid duplication with usage/processing-pipelines#trainable-components ?
-->
### Example: Entity relation extraction component {#component-rel}
<!-- ![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
This section outlines an example use-case of implementing a **novel relation
extraction component** from scratch. We'll implement a binary relation
extraction method that determines whether or not **two entities** in a document
are related, and if so, what type of relation. We'll allow multiple types of
relations between two such entities (multi-label setting). There are two major
steps required:
1. Implement a [machine learning model](#component-rel-model) specific to this
task. It will have to extract candidates from a [`Doc`](/api/doc) and predict
a relation for the available candidate pairs.
2. Implement a custom [pipeline component](#component-rel-pipe) powered by the
machine learning model that sets annotations on the [`Doc`](/api/doc) passing
through the pipeline.
<!-- TODO: <Project id="tutorials/ner-relations">
</Project> -->
#### Step 1: Implementing the Model {#component-rel-model}
We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a
**list of documents** (~~List[Doc]~~) as input, and outputs a **two-dimensional
matrix** (~~Floats2d~~) of predictions:
> #### Model type annotations
>
> The `Model` class is a generic type that can specify its input and output
> types, e.g. ~~Model[List[Doc], Floats2d]~~. Type hints are used for static
> type checks and validation. See the section on [type signatures](#type-sigs)
> for details.
```python
def update(self, examples):
docs = [ex.predicted for ex in examples]
refs = [ex.reference for ex in examples]
predictions, backprop = self.model.begin_update(docs)
gradient = self.get_loss(predictions, refs)
backprop(gradient)
def __call__(self, doc):
predictions = self.model([doc])
self.set_annotations(predictions)
### Register the model architecture
@registry.architectures.register("rel_model.v1")
def create_relation_model(...) -> Model[List[Doc], Floats2d]:
model = ... # 👈 model will go here
return model
```
-->
The first layer in this model will typically be an
[embedding layer](/usage/embeddings-transformers) such as a
[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
transforms each **document into a list of tokens**, with each token being
represented by its embedding in the vector space.
Next, we need a method that **generates pairs of entities** that we want to
classify as being related or not. As these candidate pairs are typically formed
within one document, this function takes a [`Doc`](/api/doc) as input and
outputs a `List` of `Span` tuples. For instance, a very straightforward
implementation would be to just take any two entities from the same document:
```python
### Simple candidate generation
def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
candidates = []
for ent1 in doc.ents:
for ent2 in doc.ents:
candidates.append((ent1, ent2))
return candidates
```
But we could also refine this further by **excluding relations** of an entity
with itself, and imposing a **maximum distance** (in number of tokens) between two
entities. We register this function in the
[`@misc` registry](/api/top-level#registry) so we can refer to it from the
config, and easily swap it out for any other candidate generation function.
> #### config.cfg (excerpt)
>
> ```ini
> [model]
> @architectures = "rel_model.v1"
>
> [model.tok2vec]
> # ...
>
> [model.get_candidates]
> @misc = "rel_cand_generator.v1"
> max_length = 20
> ```
```python
### Extended candidate generation {highlight="1,2,7,8"}
@registry.misc.register("rel_cand_generator.v1")
def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
candidates = []
for ent1 in doc.ents:
for ent2 in doc.ents:
if ent1 != ent2:
if max_length and abs(ent2.start - ent1.start) <= max_length:
candidates.append((ent1, ent2))
return candidates
return get_candidates
```
Finally, we require a method that transforms the candidate entity pairs into a
2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or
[`Transformer`](/api/transformer). The resulting ~~Floats2d~~ object will then be
processed by a final `output_layer` of the network. Putting all this together,
we can define our relation model in a config file as follows:
```ini
### config.cfg
[model]
@architectures = "rel_model.v1"
# ...
[model.tok2vec]
# ...
[model.get_candidates]
@misc = "rel_cand_generator.v1"
max_length = 20
[model.create_candidate_tensor]
@misc = "rel_cand_tensor.v1"
[model.output_layer]
@architectures = "rel_output_layer.v1"
# ...
```
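The `rel_cand_tensor.v1` function referenced under `[model.create_candidate_tensor]`
isn't spelled out in this guide. As a rough sketch only (the signature, receiving
the per-document token vectors, the candidate pairs and the model's `ops`, is an
assumption), such a helper could build one row per candidate pair by pooling and
concatenating the token vectors of the two entities. A real implementation would
also need a backprop path so the gradient can flow back into the embedding layer:
```python
### Sketch: candidate tensor helper (assumed signature)
from typing import Callable, List, Tuple
from spacy.tokens import Span
from spacy.util import registry
from thinc.api import Ops
from thinc.types import Floats2d
@registry.misc.register("rel_cand_tensor.v1")
def create_tensor_helper() -> Callable:
    def create_candidate_tensor(
        tokvecs: List[Floats2d],
        candidates: List[List[Tuple[Span, Span]]],
        ops: Ops,
    ) -> Floats2d:
        # One row per candidate pair: the mean token vector of each entity,
        # concatenated. Empty candidate lists and backprop aren't handled here.
        rows = []
        for doc_vectors, doc_candidates in zip(tokvecs, candidates):
            for ent1, ent2 in doc_candidates:
                v1 = doc_vectors[ent1.start : ent1.end].mean(axis=0)
                v2 = doc_vectors[ent2.start : ent2.end].mean(axis=0)
                rows.append(ops.xp.hstack((v1, v2)))
        return ops.xp.vstack(rows)
    return create_candidate_tensor
```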
<!-- TODO: link to project for implementation details -->
<!-- TODO: maybe embed files from project that show the architectures? -->
When creating this model, we store the custom functions as
[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
references, so we can access them easily:
```python
tok2vec_layer = model.get_ref("tok2vec")
output_layer = model.get_ref("output_layer")
create_candidate_tensor = model.attrs["create_candidate_tensor"]
get_candidates = model.attrs["get_candidates"]
```
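The guide doesn't show how these references and attributes end up on the model.
The snippet below is one possible way to wire the pieces together with Thinc's
[`Model`](https://thinc.ai/docs/api-model) constructor, not the actual
implementation of `rel_model.v1`: the forward pass is deliberately simplified
and doesn't backpropagate into the embedding layer, and the
`create_candidate_tensor` call follows the assumed signature from the sketch
above:
```python
### Sketch: composing the relation model (simplified)
from typing import Callable, List, Tuple
from spacy.tokens import Doc, Span
from spacy.util import registry
from thinc.api import Model
from thinc.types import Floats2d
@registry.architectures.register("rel_model.v1")
def create_relation_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    get_candidates: Callable[[Doc], List[Tuple[Span, Span]]],
    create_candidate_tensor: Callable,
    output_layer: Model[Floats2d, Floats2d],
) -> Model[List[Doc], Floats2d]:
    def forward(model: Model, docs: List[Doc], is_train: bool):
        # Access the sublayers and helper functions via the refs and attrs
        tokvecs, bp_tokvecs = model.get_ref("tok2vec")(docs, is_train)
        candidates = [model.attrs["get_candidates"](doc) for doc in docs]
        pairs = model.attrs["create_candidate_tensor"](tokvecs, candidates, model.ops)
        scores, bp_scores = model.get_ref("output_layer")(pairs, is_train)
        def backprop(d_scores):
            bp_scores(d_scores)  # simplified: no gradient is routed back into tok2vec
            return []
        return scores, backprop
    return Model(
        "relation_model",
        forward,
        layers=[tok2vec, output_layer],
        refs={"tok2vec": tok2vec, "output_layer": output_layer},
        attrs={
            "get_candidates": get_candidates,
            "create_candidate_tensor": create_candidate_tensor,
        },
    )
```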
#### Step 2: Implementing the pipeline component {#component-rel-pipe}
To use our new relation extraction model as part of a custom
[trainable component](/usage/processing-pipelines#trainable-components), we
create a subclass of [`Pipe`](/api/pipe) that holds the model:
```python
### Pipeline component skeleton
from spacy.pipeline import Pipe
class RelationExtractor(Pipe):
def __init__(self, vocab, model, name="rel"):
"""Create a component instance."""
self.model = model
self.vocab = vocab
self.name = name
def update(self, examples, drop=0.0, set_annotations=False, sgd=None, losses=None):
"""Learn from a batch of Example objects."""
...
def predict(self, docs):
"""Apply the model to a batch of Doc objects."""
...
def set_annotations(self, docs, predictions):
"""Modify a batch of Doc objects using the predictions."""
...
def initialize(self, get_examples, nlp=None, labels=None):
"""Initialize the model before training."""
...
def add_label(self, label):
"""Add a label to the component."""
...
```
Before the model can be used, it needs to be
[initialized](/usage/training#initialization). This function receives a callback
to access the full **training data set**, or a representative sample. This data
set can be used to deduce all **relevant labels**. Alternatively, a list of
labels can be provided to `initialize`, or you can call
`RelationExtractor.add_label` directly. The number of labels defines the output
dimensionality of the network, and will be used to do
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
layers of the neural network. This is triggered by calling
[`Model.initialize`](https://thinc.ai/docs/api-model#initialize).
```python
### The initialize method {highlight="12,18,22"}
from itertools import islice
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Language = None,
labels: Optional[List[str]] = None,
):
if labels is not None:
for label in labels:
self.add_label(label)
else:
for example in get_examples():
relations = example.reference._.rel
for indices, label_dict in relations.items():
for label in label_dict.keys():
self.add_label(label)
subbatch = list(islice(get_examples(), 10))
doc_sample = [eg.reference for eg in subbatch]
label_sample = self._examples_to_truth(subbatch)
self.model.initialize(X=doc_sample, Y=label_sample)
```
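The `_examples_to_truth` helper that produces the `label_sample` isn't shown in
this guide. One way it could look, assuming the gold relations are stored in
`example.reference._.rel` as a dictionary keyed by the entities' start offsets
(the same convention used for predictions further below), is to build one row
per candidate pair with one column per label:
```python
### Sketch: a possible _examples_to_truth helper
from typing import List, Optional
import numpy
from spacy.training import Example
from thinc.types import Floats2d
def _examples_to_truth(self, examples: List[Example]) -> Optional[Floats2d]:
    get_candidates = self.model.attrs["get_candidates"]
    nr_candidates = sum(len(get_candidates(eg.reference)) for eg in examples)
    if nr_candidates == 0:
        return None
    # One row per candidate pair, one column per known label
    truths = numpy.zeros((nr_candidates, len(self.labels)), dtype="float32")
    c = 0
    for eg in examples:
        for (e1, e2) in get_candidates(eg.reference):
            gold = eg.reference._.rel.get((e1.start, e2.start), {})
            for j, label in enumerate(self.labels):
                truths[c, j] = gold.get(label, 0.0)
            c += 1
    return self.model.ops.asarray(truths)
```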
The `initialize` method is triggered whenever this component is part of an `nlp`
pipeline, and [`nlp.initialize`](/api/language#initialize) is invoked.
Typically, this happens when the pipeline is set up before training in
[`spacy train`](/api/cli#train). After initialization, the pipeline component
and its internal model can be trained and used to make predictions.
During training, the function [`update`](/api/pipe#update) is invoked, which
delegates to
[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
batch of examples, as well as the **gradient** of the loss that will be used to
update the weights of the model layers. Thinc provides several
[loss functions](https://thinc.ai/docs/api-loss) that can be used for the
implementation of the `get_loss` function.
```python
### The update method {highlight="12-14"}
def update(
self,
examples: Iterable[Example],
*,
drop: float = 0.0,
set_annotations: bool = False,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
...
docs = [ex.predicted for ex in examples]
predictions, backprop = self.model.begin_update(docs)
loss, gradient = self.get_loss(examples, predictions)
backprop(gradient)
losses[self.name] += loss
...
return losses
```
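The `get_loss` method called here is left to the implementation. As a simple
illustration, and assuming the candidate pairs line up between the predicted and
reference docs, a mean-squared-error version could reuse the
`_examples_to_truth` helper sketched above and return the loss together with the
gradient expected by `update`. Any of the Thinc loss functions mentioned above
could be substituted:
```python
### Sketch: a mean-squared-error get_loss method
from typing import Iterable, Tuple
from spacy.training import Example
from thinc.types import Floats2d
def get_loss(self, examples: Iterable[Example], scores: Floats2d) -> Tuple[float, Floats2d]:
    # Compare the predictions against the gold relations in example.reference._.rel
    truths = self._examples_to_truth(list(examples))
    gradient = scores - truths
    mean_square_error = (gradient ** 2).sum(axis=1).mean()
    return float(mean_square_error), gradient
```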
When the internal model is trained, the component can be used to make novel
**predictions**. The [`predict`](/api/pipe#predict) function needs to be
implemented for each subclass of `Pipe`. In our case, we can simply delegate to
the internal model's [predict](https://thinc.ai/docs/api-model#predict) function
that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array:
```python
### The predict method
def predict(self, docs: Iterable[Doc]) -> Floats2d:
predictions = self.model.predict(docs)
return self.model.ops.asarray(predictions)
```
The final method that needs to be implemented is
[`set_annotations`](/api/pipe#set_annotations). This function takes the
predictions, and modifies the given `Doc` object in place to store them. For our
relation extraction component, we store the data as a dictionary in a custom
[extension attribute](/usage/processing-pipelines#custom-components-attributes)
`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of
each entity**, as this defines an entity pair uniquely within one document.
To interpret the scores predicted by the relation extraction model correctly, we
need to refer to the model's `get_candidates` function that defined which pairs
of entities were relevant candidates, so that the predictions can be linked to
those exact entities:
> #### Example output
>
> ```python
> doc = nlp("Amsterdam is the capital of the Netherlands.")
> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
> for value, rel_dict in doc._.rel.items():
> print(f"{value}: {rel_dict}")
>
> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
> ```
```python
### Registering the extension attribute
from spacy.tokens import Doc
Doc.set_extension("rel", default={})
```
```python
### The set_annotations method {highlight="5-6,10"}
def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
c = 0
get_candidates = self.model.attrs["get_candidates"]
for doc in docs:
for (e1, e2) in get_candidates(doc):
offset = (e1.start, e2.start)
if offset not in doc._.rel:
doc._.rel[offset] = {}
for j, label in enumerate(self.labels):
doc._.rel[offset][label] = predictions[c, j]
c += 1
```
Under the hood, when the pipe is applied to a document, it delegates to the
`predict` and `set_annotations` methods:
```python
### The __call__ method
def __call__(self, doc: Doc) -> Doc:
predictions = self.predict([doc])
self.set_annotations([doc], predictions)
return doc
```
Once our `Pipe` subclass is fully implemented, we can
[register](/usage/processing-pipelines#custom-components-factories) the
component with the [`@Language.factory`](/api/language#factory) decorator. This
assigns it a name and lets you create the component with
[`nlp.add_pipe`](/api/language#add_pipe) and via the
[config](/usage/training#config).
> #### config.cfg (excerpt)
>
> ```ini
> [components.relation_extractor]
> factory = "relation_extractor"
>
> [components.relation_extractor.model]
> @architectures = "rel_model.v1"
>
> [components.relation_extractor.model.tok2vec]
> # ...
>
> [components.relation_extractor.model.get_candidates]
> @misc = "rel_cand_generator.v1"
> max_length = 20
> ```
```python
### Registering the pipeline component
from spacy.language import Language
@Language.factory("relation_extractor")
def make_relation_extractor(nlp, name, model):
return RelationExtractor(nlp.vocab, model, name)
```
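With the factory registered, the component can be created by name. The following
usage sketch isn't runnable as-is: the model sub-config (abbreviated with
`# ...`) needs to be filled in with a complete architecture definition, and
`train_examples` stands in for a list of `Example` objects whose reference docs
carry gold `doc._.rel` annotations:
```python
### Sketch: adding the component to a pipeline
import spacy
nlp = spacy.blank("en")
model_config = {
    "@architectures": "rel_model.v1",
    # tok2vec, get_candidates, create_candidate_tensor, output_layer ...
}
nlp.add_pipe("relation_extractor", config={"model": model_config})
nlp.initialize(get_examples=lambda: train_examples)
doc = nlp("Amsterdam is the capital of the Netherlands.")
print(doc._.rel)
```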
<!-- TODO: <Project id="tutorials/ner-relations">
</Project> -->

View File

@@ -1176,7 +1176,7 @@ plug fully custom machine learning components into your pipeline. You'll need
the following:
1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
can be a model using implemented in
can be a model implemented in
[Thinc](/usage/layers-architectures#thinc), or a
[wrapped model](/usage/layers-architectures#frameworks) implemented in
PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a

View File

@@ -216,15 +216,16 @@ pipelines.
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
```
| Section | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
| Section | Description |
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If the installed version of spaCy is outside this range, an error is raised when the project is loaded. |
### Data assets {#data-assets}

View File

@@ -38,7 +38,7 @@
cursor: pointer
display: inline-block
padding: 0.35rem 0.5rem 0.25rem 0
margin: 0 1rem 0.75rem 0
margin: 0 1rem 0.5rem 0
font-size: var(--font-size-xs)
font-weight: bold
@@ -73,16 +73,19 @@
background: var(--color-theme)
.checkbox + &:before
$size: 18px
content: ""
display: inline-block
width: 20px
height: 20px
width: $size
height: $size
border: 1px solid var(--color-subtle)
vertical-align: middle
margin-right: 0.5rem
cursor: pointer
border-radius: var(--border-radius)
border-radius: $size / 4
background: var(--color-back)
position: relative
top: -1px
.checkbox:checked + &:before
// Embed "check" icon here for simplicity

View File

@@ -4,6 +4,8 @@ import { StaticQuery, graphql } from 'gatsby'
import { Quickstart, QS } from '../components/quickstart'
import { repo } from '../components/util'
const DEFAULT_MODELS = ['en']
const DEFAULT_OPT = 'efficiency'
const DEFAULT_HARDWARE = 'cpu'
const DEFAULT_CUDA = 'cuda100'
const CUDA = {
@@ -15,6 +17,7 @@ const CUDA = {
'10.1': 'cuda101',
'10.2': 'cuda102',
}
const LANG_EXTRAS = ['zh', 'ja'] // only for languages with models
const DATA = [
{
id: 'os',
@@ -68,14 +71,24 @@ const QuickstartInstall = ({ id, title }) => {
const [train, setTrain] = useState(false)
const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
const [cuda, setCuda] = useState(DEFAULT_CUDA)
const [selectedModels, setModels] = useState(DEFAULT_MODELS)
const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
const setters = {
hardware: v => (Array.isArray(v) ? setHardware(v[0]) : setCuda(v)),
config: v => setTrain(v.includes('train')),
models: setModels,
optimize: v => setEfficiency(v.includes('efficiency')),
}
const showDropdown = {
hardware: () => hardware === 'gpu',
}
const pipExtras = [hardware === 'gpu' && cuda, train && 'transformers', train && 'lookups']
const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : []
const pipExtras = [
hardware === 'gpu' && cuda,
train && 'transformers',
train && 'lookups',
...modelExtras,
]
.filter(e => e)
.join(',')
return (
@@ -89,13 +102,37 @@ const QuickstartInstall = ({ id, title }) => {
...DATA,
{
id: 'models',
title: 'Trained Pipelines',
title: 'Trained pipelines',
multiple: true,
options: models
.sort((a, b) => a.name.localeCompare(b.name))
.map(({ code, name }) => ({ id: code, title: name })),
.map(({ code, name }) => ({
id: code,
title: name,
checked: DEFAULT_MODELS.includes(code),
})),
},
]
if (selectedModels.length) {
data.push({
id: 'optimize',
title: 'Select pipeline for',
options: [
{
id: 'efficiency',
title: 'efficiency',
checked: DEFAULT_OPT === 'efficiency',
help: 'Faster and smaller pipeline, but less accurate',
},
{
id: 'accuracy',
title: 'accuracy',
checked: DEFAULT_OPT === 'accuracy',
help: 'Larger and slower pipeline, but more accurate',
},
],
})
}
return (
<Quickstart
data={data}
@ -149,11 +186,14 @@ const QuickstartInstall = ({ id, title }) => {
conda install -c conda-forge spacy-lookups-data
</QS>
{models.map(({ code, models: modelOptions }) => (
<QS models={code} key={code}>
python -m spacy download {modelOptions[0]}
</QS>
))}
{models.map(({ code, models: modelOptions }) => {
const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]
return (
<QS models={code} key={code}>
python -m spacy download {pkg}
</QS>
)
})}
</Quickstart>
)
}}

View File

@@ -31,25 +31,33 @@ const data = [
},
{
id: 'optimize',
title: 'Optimize for',
help:
'Optimize for efficiency (faster & smaller model) or higher accuracy (larger & slower model)',
title: 'Select for',
options: [
{ id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
{ id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
{
id: 'efficiency',
title: 'efficiency',
checked: DEFAULT_OPT === 'efficiency',
help: 'Faster and smaller pipeline, but less accurate',
},
{
id: 'accuracy',
title: 'accuracy',
checked: DEFAULT_OPT === 'accuracy',
help: 'Larger and slower pipeline, but more accurate',
},
],
},
{
id: 'config',
title: 'Options',
multiple: true,
options: [{ id: 'example', title: 'Show usage example' }],
options: [{ id: 'example', title: 'Show text example' }],
},
]
const QuickstartInstall = ({ id, title, description, children }) => {
const [lang, setLang] = useState(DEFAULT_LANG)
const [efficiency, setEfficiency] = useState(DEFAULT_OPT)
const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
const setters = {
lang: setLang,
optimize: v => setEfficiency(v.includes('efficiency')),