Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-10 16:22:29 +03:00)

Commit e8156d191f: Merge branch 'develop' into nightly.spacy.io

Makefile (2 lines changed):
@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba pkuseg==0.0.25 pickle5 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==1.0.0rc0 jieba spacy-pkuseg==0.0.26 sudachipy sudachidict_core
 endif
 
 ifndef PYVER
@@ -7,7 +7,7 @@ requires = [
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
     "thinc>=8.0.0a43,<8.0.0a50",
-    "blis>=0.4.0,<0.5.0",
+    "blis>=0.4.0,<0.8.0",
     "pytokenizations",
     "pathy"
 ]
@@ -2,7 +2,7 @@
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.0a43,<8.0.0a50
-blis>=0.4.0,<0.5.0
+blis>=0.4.0,<0.8.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.0,<1.1.0
@@ -41,7 +41,7 @@ install_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     thinc>=8.0.0a43,<8.0.0a50
-    blis>=0.4.0,<0.5.0
+    blis>=0.4.0,<0.8.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.3.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
@@ -92,6 +92,8 @@ ko =
     natto-py==0.9.0
 th =
     pythainlp>=2.0
+zh =
+    spacy-pkuseg==0.0.26
 
 [bdist_wheel]
 universal = false
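The new zh extras group lets the Chinese segmentation dependency be pulled in through pip extras instead of being installed by hand. A minimal sketch, assuming the nightly package name from about.py below:

    pip install "spacy-nightly[zh]"
    # or install the pinned dependency directly:
    pip install spacy-pkuseg==0.0.26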
@@ -1,7 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a32"
-__release__ = True
+__version__ = "3.0.0a34"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
@@ -1,4 +1,4 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING
+from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, TYPE_CHECKING
 import sys
 import shutil
 from pathlib import Path
@@ -16,7 +16,8 @@ import os
 
 from ..schemas import ProjectConfigSchema, validate
 from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import ENV_VARS
+from ..util import is_compatible_version, ENV_VARS
+from .. import about
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -142,6 +143,7 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]:
         msg.fail(invalid_err)
         print("\n".join(errors))
         sys.exit(1)
+    validate_project_version(config)
     validate_project_commands(config)
     # Make sure directories defined in config exist
     for subdir in config.get("directories", []):
@@ -167,6 +169,23 @@ def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}):
     return dict(interpolated["project"])
 
 
+def validate_project_version(config: Dict[str, Any]) -> None:
+    """If the project defines a compatible spaCy version range, check that it's
+    compatible with the current version of spaCy.
+
+    config (Dict[str, Any]): The loaded config.
+    """
+    spacy_version = config.get("spacy_version", None)
+    if spacy_version and not is_compatible_version(about.__version__, spacy_version):
+        err = (
+            f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
+            f"that's not compatible with the version of spaCy you're running "
+            f"({about.__version__}). You can edit version requirement in the "
+            f"{PROJECT_FILE} to load it, but the project may not run as expected."
+        )
+        msg.fail(err, exits=1)
+
+
 def validate_project_commands(config: Dict[str, Any]) -> None:
     """Check that project commands and workflows are valid, don't contain
     duplicates, don't clash and only refer to commands that exist.
@@ -193,12 +212,15 @@ def validate_project_commands(config: Dict[str, Any]) -> None:
         )
 
 
-def get_hash(data) -> str:
+def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
     """Get the hash for a JSON-serializable object.
 
     data: The data to hash.
+    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
     RETURNS (str): The hash.
     """
+    if isinstance(data, dict):
+        data = {k: v for k, v in data.items() if k not in exclude}
     data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
     return hashlib.md5(data_str).hexdigest()
 
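The new exclude argument strips the listed top-level keys before hashing, so two dicts that differ only in those keys hash identically. A minimal sketch of the behaviour, with made-up dict values:

    from spacy.cli._util import get_hash

    a = {"script": ["python train.py"], "spacy_version": "3.0.0a32"}
    b = {"script": ["python train.py"], "spacy_version": "3.0.0a34"}
    assert get_hash(a) != get_hash(b)
    # ignoring the volatile key makes the two entries compare equal
    assert get_hash(a, exclude=["spacy_version"]) == get_hash(b, exclude=["spacy_version"])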
@@ -7,7 +7,9 @@ import tarfile
 from pathlib import Path
 
 from .._util import get_hash, get_checksum, download_file, ensure_pathy
-from ...util import make_tempdir
+from ...util import make_tempdir, get_minor_version, ENV_VARS, check_bool_env_var
+from ...git_info import GIT_VERSION
+from ... import about
 
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -129,7 +131,10 @@ def get_command_hash(
     currently installed packages, whatever environment variables have been marked
     as relevant, and the command.
     """
-    hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)]
+    check_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
+    spacy_v = GIT_VERSION if check_commit else get_minor_version(about.__version__)
+    dep_checksums = [get_checksum(dep) for dep in sorted(deps)]
+    hashes = [spacy_v, site_hash, env_hash] + dep_checksums
     hashes.extend(cmd)
     creation_bytes = "".join(hashes).encode("utf8")
     return hashlib.md5(creation_bytes).hexdigest()
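get_command_hash() now folds the spaCy minor version into every remote-storage hash, so cached outputs are keyed per minor release by default. Commit-level granularity is opt-in through the ENV_VARS.PROJECT_USE_GIT_VERSION flag; a minimal sketch, assuming that flag resolves to the environment variable name SPACY_PROJECT_USE_GIT_VERSION:

    import os

    # assumption: ENV_VARS.PROJECT_USE_GIT_VERSION == "SPACY_PROJECT_USE_GIT_VERSION"
    os.environ["SPACY_PROJECT_USE_GIT_VERSION"] = "true"
    # with the flag set, the exact GIT_VERSION commit is hashed instead of the
    # minor version, so remote caches are effectively per-commit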
@@ -4,8 +4,11 @@ from wasabi import msg
 import sys
 import srsly
 
+from ... import about
+from ...git_info import GIT_VERSION
 from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList
+from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
+from ...util import check_bool_env_var
 from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
 from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
 
@@ -62,12 +65,13 @@ def project_run(
                 err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
                 err_kwargs = {"exits": 1} if not dry else {}
                 msg.fail(err, err_help, **err_kwargs)
+        check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
         with working_dir(project_dir) as current_dir:
-            rerun = check_rerun(current_dir, cmd)
+            msg.divider(subcommand)
+            rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
             if not rerun and not force:
                 msg.info(f"Skipping '{cmd['name']}': nothing changed")
             else:
-                msg.divider(subcommand)
                 run_commands(cmd["script"], dry=dry)
                 if not dry:
                     update_lockfile(current_dir, cmd)
@@ -171,12 +175,19 @@ def validate_subcommand(
         )
 
 
-def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
+def check_rerun(
+    project_dir: Path,
+    command: Dict[str, Any],
+    *,
+    check_spacy_version: bool = True,
+    check_spacy_commit: bool = False,
+) -> bool:
     """Check if a command should be rerun because its settings or inputs/outputs
     changed.
 
     project_dir (Path): The current project directory.
     command (Dict[str, Any]): The command, as defined in the project.yml.
+    strict_version (bool):
     RETURNS (bool): Whether to re-run the command.
     """
     lock_path = project_dir / PROJECT_LOCK
@@ -189,10 +200,23 @@ def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool:
     # Always run commands with no outputs (otherwise they'd always be skipped)
     if not entry.get("outs", []):
         return True
+    # Always rerun if spaCy version or commit hash changed
+    spacy_v = entry.get("spacy_version")
+    commit = entry.get("spacy_git_version")
+    if check_spacy_version and not is_minor_version_match(spacy_v, about.__version__):
+        info = f"({spacy_v} in {PROJECT_LOCK}, {about.__version__} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy minor version changed {info}")
+        return True
+    if check_spacy_commit and commit != GIT_VERSION:
+        info = f"({commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
+        msg.info(f"Re-running '{command['name']}': spaCy commit changed {info}")
+        return True
     # If the entry in the lockfile matches the lockfile entry that would be
     # generated from the current command, we don't rerun because it means that
     # all inputs/outputs, hashes and scripts are the same and nothing changed
-    return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry)
+    lock_entry = get_lock_entry(project_dir, command)
+    exclude = ["spacy_version", "spacy_git_version"]
+    return get_hash(lock_entry, exclude=exclude) != get_hash(entry, exclude=exclude)
 
 
 def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
@@ -231,6 +255,8 @@ def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]
         "script": command["script"],
         "deps": deps,
         "outs": [*outs, *outs_nc],
+        "spacy_version": about.__version__,
+        "spacy_git_version": GIT_VERSION,
     }
 
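Each lockfile entry written by update_lockfile() now records the spaCy version and git commit it was produced with; check_rerun() compares those fields first and only then falls back to the hash comparison, with the two new keys excluded so they don't perturb the hashes themselves. A sketch of the resulting entry shape, with illustrative values:

    # illustrative shape of a get_lock_entry() result; values are made up
    lock_entry = {
        "script": ["python -m spacy train configs/config.cfg -o training/"],
        "deps": [],
        "outs": ["training/model-best"],
        "spacy_version": "3.0.0a34",        # checked with is_minor_version_match()
        "spacy_git_version": "e8156d191f",  # checked only when the env flag is set
    }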
@ -171,9 +171,14 @@ factory = "tok2vec"
|
||||||
[components.tok2vec.model.embed]
|
[components.tok2vec.model.embed]
|
||||||
@architectures = "spacy.MultiHashEmbed.v1"
|
@architectures = "spacy.MultiHashEmbed.v1"
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
|
{% if has_letters -%}
|
||||||
also_embed_subwords = {{ "true" if has_letters else "false" }}
|
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||||
also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
|
rows = [5000, 2500, 2500, 2500]
|
||||||
|
{% else -%}
|
||||||
|
attrs = ["ORTH", "SHAPE"]
|
||||||
|
rows = [5000, 2500]
|
||||||
|
{% endif -%}
|
||||||
|
include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }}
|
||||||
|
|
||||||
[components.tok2vec.model.encode]
|
[components.tok2vec.model.encode]
|
||||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||||
|
|
|
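For a language where has_letters is true and optimize is not "accuracy", the template above renders to roughly the following embed block (a sketch of the expected output, not copied from a generated config):

    [components.tok2vec.model.embed]
    @architectures = "spacy.MultiHashEmbed.v1"
    width = ${components.tok2vec.model.encode.width}
    attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
    rows = [5000, 2500, 2500, 2500]
    include_static_vectors = false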
@@ -456,10 +456,14 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
-    E092 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
+    E901 = ("Failed to remove existing output directory: {path}. If your "
+            "config and the components you train change between runs, a "
+            "non-empty output directory can lead to stale pipeline data. To "
+            "solve this, remove the existing directories in the output directory.")
+    E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
-    E093 = ("The token-per-line NER file is not formatted correctly. Try checking "
+    E903 = ("The token-per-line NER file is not formatted correctly. Try checking "
             "whitespace and delimiters. See https://nightly.spacy.io/api/cli#convert")
     E904 = ("Cannot initialize StaticVectors layer: nO dimension unset. This "
             "dimension refers to the output width, after the linear projection "
@@ -25,8 +25,14 @@ class Russian(Language):
     default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
+def make_lemmatizer(
+    nlp: Language,
+    model: Optional[Model],
+    name: str,
+    mode: str,
+    overwrite: bool = False,
+):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 __all__ = ["Russian"]
@@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Tuple
 
 from thinc.api import Model
 
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 from ...symbols import POS
 from ...tokens import Token
@@ -22,9 +21,9 @@ class RussianLemmatizer(Lemmatizer):
         name: str = "lemmatizer",
         *,
         mode: str = "pymorphy2",
-        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
     ) -> None:
-        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
 
         try:
             from pymorphy2 import MorphAnalyzer
@@ -26,8 +26,10 @@ class Ukrainian(Language):
     default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
+def make_lemmatizer(
+    nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
+):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 __all__ = ["Ukrainian"]
@@ -3,7 +3,6 @@ from typing import Optional
 from thinc.api import Model
 
 from ..ru.lemmatizer import RussianLemmatizer
-from ...lookups import Lookups
 from ...vocab import Vocab
 
 
@@ -15,9 +14,9 @@ class UkrainianLemmatizer(RussianLemmatizer):
         name: str = "lemmatizer",
         *,
         mode: str = "pymorphy2",
-        lookups: Optional[Lookups] = None,
+        overwrite: bool = False,
     ) -> None:
-        super().__init__(vocab, model, name, mode=mode, lookups=lookups)
+        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
         try:
             from pymorphy2 import MorphAnalyzer
         except ImportError:
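Both pymorphy2 lemmatizers now take the shared overwrite setting instead of the removed lookups argument. A minimal sketch of enabling it when adding the component, assuming the standard "lemmatizer" factory name used above (requires the pymorphy2 package):

    import spacy

    nlp = spacy.blank("ru")
    # overwrite=True lets the lemmatizer replace lemmas that are already set
    nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2", "overwrite": True})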
@@ -17,8 +17,7 @@ from ... import util
 
 
 # fmt: off
-_PKUSEG_INSTALL_MSG = "install pkuseg and pickle5 with `pip install pkuseg==0.0.25 pickle5`"
-_PKUSEG_PICKLE_WARNING = "Failed to force pkuseg model to use pickle protocol 4. If you're saving this model with python 3.8, it may not work with python 3.6-3.7."
+_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
 # fmt: on
 
 DEFAULT_CONFIG = """
@@ -55,9 +54,7 @@ def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
 
 
 class ChineseTokenizer(DummyTokenizer):
-    def __init__(
-        self, nlp: Language, segmenter: Segmenter = Segmenter.char,
-    ):
+    def __init__(self, nlp: Language, segmenter: Segmenter = Segmenter.char):
         self.vocab = nlp.vocab
         if isinstance(segmenter, Segmenter):
             segmenter = segmenter.value
@@ -82,11 +79,13 @@ class ChineseTokenizer(DummyTokenizer):
         *,
         nlp: Optional[Language] = None,
         pkuseg_model: Optional[str] = None,
-        pkuseg_user_dict: str = "default",
+        pkuseg_user_dict: Optional[str] = "default",
     ):
         if self.segmenter == Segmenter.pkuseg:
+            if pkuseg_user_dict is None:
+                pkuseg_user_dict = pkuseg_model
             self.pkuseg_seg = try_pkuseg_import(
-                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict,
+                pkuseg_model=pkuseg_model, pkuseg_user_dict=pkuseg_user_dict
             )
 
     def __call__(self, text: str) -> Doc:
@@ -120,12 +119,12 @@ class ChineseTokenizer(DummyTokenizer):
         if self.segmenter == Segmenter.pkuseg:
             if reset:
                 try:
-                    import pkuseg
+                    import spacy_pkuseg
 
-                    self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
+                    self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(None)
                 except ImportError:
                     msg = (
-                        "pkuseg not installed: unable to reset pkuseg "
+                        "spacy_pkuseg not installed: unable to reset pkuseg "
                         "user dict. Please " + _PKUSEG_INSTALL_MSG
                     )
                     raise ImportError(msg) from None
@@ -156,23 +155,7 @@ class ChineseTokenizer(DummyTokenizer):
                 self.pkuseg_seg.feature_extractor.save(tempdir)
                 self.pkuseg_seg.model.save(tempdir)
                 tempdir = Path(tempdir)
-                # pkuseg saves features.pkl with pickle.HIGHEST_PROTOCOL, which
-                # means that it will be saved with pickle protocol 5 with
-                # python 3.8, which can't be reloaded with python 3.6-3.7.
-                # To try to make the model compatible with python 3.6+, reload
-                # the data with pickle5 and convert it back to protocol 4.
-                try:
-                    import pickle5
-
-                    with open(tempdir / "features.pkl", "rb") as fileh:
-                        features = pickle5.load(fileh)
-                    with open(tempdir / "features.pkl", "wb") as fileh:
-                        pickle5.dump(features, fileh, protocol=4)
-                except ImportError as e:
-                    raise e
-                except Exception:
-                    warnings.warn(_PKUSEG_PICKLE_WARNING)
-                with open(tempdir / "features.pkl", "rb") as fileh:
+                with open(tempdir / "features.msgpack", "rb") as fileh:
                     pkuseg_features_b = fileh.read()
                 with open(tempdir / "weights.npz", "rb") as fileh:
                     pkuseg_weights_b = fileh.read()
@@ -213,22 +196,22 @@ class ChineseTokenizer(DummyTokenizer):
         if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
             with tempfile.TemporaryDirectory() as tempdir:
                 tempdir = Path(tempdir)
-                with open(tempdir / "features.pkl", "wb") as fileh:
+                with open(tempdir / "features.msgpack", "wb") as fileh:
                     fileh.write(pkuseg_data["features_b"])
                 with open(tempdir / "weights.npz", "wb") as fileh:
                     fileh.write(pkuseg_data["weights_b"])
                 try:
-                    import pkuseg
+                    import spacy_pkuseg
                 except ImportError:
                     raise ImportError(
-                        "pkuseg not installed. To use this model, "
+                        "spacy-pkuseg not installed. To use this model, "
                         + _PKUSEG_INSTALL_MSG
                     ) from None
-                self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
+                self.pkuseg_seg = spacy_pkuseg.pkuseg(str(tempdir))
         if pkuseg_data["processors_data"]:
             processors_data = pkuseg_data["processors_data"]
             (user_dict, do_process, common_words, other_words) = processors_data
-            self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+            self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
             self.pkuseg_seg.postprocesser.do_process = do_process
             self.pkuseg_seg.postprocesser.common_words = set(common_words)
             self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -244,18 +227,6 @@ class ChineseTokenizer(DummyTokenizer):
                     path.mkdir(parents=True)
                 self.pkuseg_seg.model.save(path)
                 self.pkuseg_seg.feature_extractor.save(path)
-                # try to convert features.pkl to pickle protocol 4
-                try:
-                    import pickle5
-
-                    with open(path / "features.pkl", "rb") as fileh:
-                        features = pickle5.load(fileh)
-                    with open(path / "features.pkl", "wb") as fileh:
-                        pickle5.dump(features, fileh, protocol=4)
-                except ImportError as e:
-                    raise e
-                except Exception:
-                    warnings.warn(_PKUSEG_PICKLE_WARNING)
 
         def save_pkuseg_processors(path):
             if self.pkuseg_seg:
@@ -279,26 +250,26 @@ class ChineseTokenizer(DummyTokenizer):
 
         def load_pkuseg_model(path):
            try:
-                import pkuseg
+                import spacy_pkuseg
            except ImportError:
                if self.segmenter == Segmenter.pkuseg:
                    raise ImportError(
-                        "pkuseg not installed. To use this model, "
+                        "spacy-pkuseg not installed. To use this model, "
                        + _PKUSEG_INSTALL_MSG
                    ) from None
            if path.exists():
-                self.pkuseg_seg = pkuseg.pkuseg(path)
+                self.pkuseg_seg = spacy_pkuseg.pkuseg(path)
 
         def load_pkuseg_processors(path):
            try:
-                import pkuseg
+                import spacy_pkuseg
            except ImportError:
                if self.segmenter == Segmenter.pkuseg:
                    raise ImportError(self._pkuseg_install_msg) from None
            if self.segmenter == Segmenter.pkuseg:
                data = srsly.read_msgpack(path)
                (user_dict, do_process, common_words, other_words) = data
-                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
+                self.pkuseg_seg.preprocesser = spacy_pkuseg.Preprocesser(user_dict)
                self.pkuseg_seg.postprocesser.do_process = do_process
                self.pkuseg_seg.postprocesser.common_words = set(common_words)
                self.pkuseg_seg.postprocesser.other_words = set(other_words)
@@ -341,12 +312,13 @@ def try_jieba_import() -> None:
 
 def try_pkuseg_import(pkuseg_model: str, pkuseg_user_dict: str) -> None:
     try:
-        import pkuseg
+        import spacy_pkuseg
 
-        return pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except ImportError:
-        msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
+        msg = "spacy-pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG
         raise ImportError(msg) from None
+    try:
+        return spacy_pkuseg.pkuseg(pkuseg_model, pkuseg_user_dict)
     except FileNotFoundError:
         msg = "Unable to load pkuseg model from: " + pkuseg_model
         raise FileNotFoundError(msg) from None
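After the switch to spacy-pkuseg, the pkuseg segmenter is selected and initialized the same way as before, just against the new package. A minimal sketch mirroring the test fixture configuration later in this commit, assuming partial config overrides are merged into the Chinese defaults (requires spacy-pkuseg):

    from spacy.lang.zh import Chinese

    config = {
        "nlp": {"tokenizer": {"segmenter": "pkuseg"}},
        "initialize": {"tokenizer": {"pkuseg_model": "web"}},
    }
    nlp = Chinese.from_config(config)
    nlp.initialize()  # loads the "web" spacy-pkuseg model
    doc = nlp("我喜欢自然语言处理")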
@@ -289,13 +289,12 @@ class Lookups:
 
         DOCS: https://nightly.spacy.io/api/lookups#to_disk
         """
-        if len(self._tables):
-            path = ensure_path(path)
-            if not path.exists():
-                path.mkdir()
-            filepath = path / filename
-            with filepath.open("wb") as file_:
-                file_.write(self.to_bytes())
+        path = ensure_path(path)
+        if not path.exists():
+            path.mkdir()
+        filepath = path / filename
+        with filepath.open("wb") as file_:
+            file_.write(self.to_bytes())
 
     def from_disk(
         self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs
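Lookups.to_disk() no longer skips writing when no tables have been added, so an empty Lookups object now round-trips through disk like any other. A small sketch of the behaviour, with an illustrative path:

    from spacy.lookups import Lookups

    lookups = Lookups()                     # no tables added
    lookups.to_disk("/tmp/lookups_demo")    # a lookups.bin file is still written
    restored = Lookups().from_disk("/tmp/lookups_demo")
    assert len(restored) == 0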
@@ -11,7 +11,7 @@ from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
+from ...attrs import intify_attr
 
 
 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -29,7 +29,7 @@ def build_hash_embed_cnn_tok2vec(
     window_size: int,
     maxout_pieces: int,
     subword_features: bool,
-    pretrained_vectors: Optional[bool]
+    pretrained_vectors: Optional[bool],
 ) -> Model[List[Doc], List[Floats2d]]:
     """Build spaCy's 'standard' tok2vec layer, which uses hash embedding
     with subword features and a CNN with layer-normalized maxout.
@@ -54,12 +54,18 @@ def build_hash_embed_cnn_tok2vec(
         a language such as Chinese.
     pretrained_vectors (bool): Whether to also use static vectors.
     """
+    if subword_features:
+        attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+        row_sizes = [embed_size, embed_size // 2, embed_size // 2, embed_size // 2]
+    else:
+        attrs = ["NORM"]
+        row_sizes = [embed_size]
     return build_Tok2Vec_model(
         embed=MultiHashEmbed(
             width=width,
-            rows=embed_size,
-            also_embed_subwords=subword_features,
-            also_use_static_vectors=bool(pretrained_vectors),
+            rows=row_sizes,
+            attrs=attrs,
+            include_static_vectors=bool(pretrained_vectors),
         ),
         encode=MaxoutWindowEncoder(
             width=width,
@@ -93,58 +99,59 @@ def build_Tok2Vec_model(
 
 @registry.architectures.register("spacy.MultiHashEmbed.v1")
 def MultiHashEmbed(
-    width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
+    width: int,
+    attrs: List[Union[str, int]],
+    rows: List[int],
+    include_static_vectors: bool,
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representations.
 
-    The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
-    varying definitions depending on the Vocab of the Doc object passed in.
-    Vectors from pretrained static vectors can also be incorporated into the
-    concatenated representation.
+    The features used can be configured with the 'attrs' argument. The suggested
+    attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
+    account some subword information, without constructing a fully character-based
+    representation. If pretrained vectors are available, they can be included in
+    the representation as well, with the vectors table kept static
+    (i.e. it's not updated).
+
+    The `width` parameter specifies the output width of the layer and the widths
+    of all embedding tables. If static vectors are included, a learned linear
+    layer is used to map the vectors to the specified width before concatenating
+    it with the other embedding outputs. A single Maxout layer is then used to
+    reduce the concatenated vectors to the final width.
+
+    The `rows` parameter controls the number of rows used by the `HashEmbed`
+    tables. The HashEmbed layer needs surprisingly few rows, due to its use of
+    the hashing trick. Generally between 2000 and 10000 rows is sufficient,
+    even for very large vocabularies. A number of rows must be specified for each
+    table, so the `rows` list must be of the same length as the `attrs` parameter.
 
     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
-    rows (int): The number of rows for the embedding tables. Can be low, due
-        to the hashing trick. Embeddings for prefix, suffix and word shape
-        use half as many rows. Recommended values are between 2000 and 10000.
-    also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
-        features in the embeddings. If not using these, you may need more
-        rows in your hash embeddings, as there will be increased chance of
-        collisions.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    attrs (list of attr IDs): The token attributes to embed. A separate
+        embedding table will be constructed for each attribute.
+    rows (List[int]): The number of rows in the embedding tables. Must have the
+        same length as attrs.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
+    if len(rows) != len(attrs):
+        raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}")
     seed = 7
 
-    def make_hash_embed(feature):
+    def make_hash_embed(index):
         nonlocal seed
         seed += 1
-        return HashEmbed(
-            width,
-            rows if feature == LOWER else rows // 2,
-            column=cols.index(feature),
-            seed=seed,
-            dropout=0.0,
-        )
+        return HashEmbed(width, rows[index], column=index, seed=seed, dropout=0.0)
 
-    if also_embed_subwords:
-        embeddings = [
-            make_hash_embed(LOWER),
-            make_hash_embed(PREFIX),
-            make_hash_embed(SUFFIX),
-            make_hash_embed(SHAPE),
-        ]
-    else:
-        embeddings = [make_hash_embed(LOWER)]
-    concat_size = width * (len(embeddings) + also_use_static_vectors)
-    if also_use_static_vectors:
+    embeddings = [make_hash_embed(i) for i in range(len(attrs))]
+    concat_size = width * (len(embeddings) + include_static_vectors)
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(
-                    FeatureExtractor(cols),
+                    FeatureExtractor(attrs),
                     list2ragged(),
                     with_array(concatenate(*embeddings)),
                 ),
@@ -155,7 +162,7 @@ def MultiHashEmbed(
             )
     else:
         model = chain(
-            FeatureExtractor(cols),
+            FeatureExtractor(list(attrs)),
             list2ragged(),
            with_array(concatenate(*embeddings)),
            with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
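The spacy.MultiHashEmbed.v1 architecture keeps its registry name but now takes explicit attrs and rows lists of equal length. A minimal sketch of calling it directly, using values that also appear in the tests further down:

    from spacy.ml.models.tok2vec import MultiHashEmbed

    embed = MultiHashEmbed(
        width=32,
        attrs=["NORM", "PREFIX", "SHAPE"],
        rows=[500, 500, 500],
        include_static_vectors=False,
    )
    # mismatched list lengths are rejected up front, e.g. rows=[500, 250] with
    # attrs=["NORM"] raises ValueError: Mismatched lengths: 2 vs 1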
@@ -210,7 +210,7 @@ class Morphologizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/morphologizer#get_loss
         """
@@ -162,7 +162,7 @@ cdef class Pipe:
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/pipe#get_loss
         """
@@ -104,7 +104,7 @@ class SentenceRecognizer(Tagger):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/sentencerecognizer#get_loss
         """
@@ -249,7 +249,7 @@ class Tagger(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/tagger#get_loss
         """
@@ -281,7 +281,7 @@ class TextCategorizer(Pipe):
 
         examples (Iterable[Examples]): The batch of examples.
         scores: Scores representing the model's predictions.
-        RETUTNRS (Tuple[float, float]): The loss and the gradient.
+        RETURNS (Tuple[float, float]): The loss and the gradient.
 
         DOCS: https://nightly.spacy.io/api/textcategorizer#get_loss
         """
@@ -448,6 +448,7 @@ class ProjectConfigSchema(BaseModel):
     workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order")
     commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts")
     title: Optional[str] = Field(None, title="Project title")
+    spacy_version: Optional[StrictStr] = Field(None, title="spaCy version range that the project is compatible with")
     # fmt: on
 
     class Config:
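With the new schema field, a project can pin the spaCy versions it supports directly in project.yml; load_project_config() then validates the range against about.__version__. An illustrative fragment (the range shown is only an example):

    # project.yml
    title: "Example pipeline project"
    spacy_version: ">=3.0.0a34,<3.1.0"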
@@ -248,7 +248,6 @@ def tt_tokenizer():
 @pytest.fixture(scope="session")
 def uk_tokenizer():
     pytest.importorskip("pymorphy2")
-    pytest.importorskip("pymorphy2.lang")
     return get_lang_class("uk")().tokenizer
 
 
@@ -285,8 +284,7 @@ def zh_tokenizer_jieba():
 
 @pytest.fixture(scope="session")
 def zh_tokenizer_pkuseg():
-    pytest.importorskip("pkuseg")
-    pytest.importorskip("pickle5")
+    pytest.importorskip("spacy_pkuseg")
     config = {
         "nlp": {
             "tokenizer": {
@@ -296,7 +294,7 @@ def zh_tokenizer_pkuseg():
         },
         "initialize": {
             "tokenizer": {
-                "pkuseg_model": "default",
+                "pkuseg_model": "web",
             }
         },
     }
@@ -209,9 +209,13 @@ def test_doc_retokenizer_split_norm(en_vocab):
     # Retokenize to split out the words in the token at doc[2].
     token = doc[2]
     with doc.retokenize() as retokenizer:
-        retokenizer.split(token, ["brown", "fox", "jumps", "over", "the"], heads=[(token, idx) for idx in range(5)])
+        retokenizer.split(
+            token,
+            ["brown", "fox", "jumps", "over", "the"],
+            heads=[(token, idx) for idx in range(5)],
+        )
 
     assert doc[9].text == "w/"
     assert doc[9].norm_ == "with"
     assert doc[5].text == "over"
     assert doc[5].norm_ == "over"
@@ -350,7 +350,7 @@ def test_pipe_methods_frozen():
 
 
 @pytest.mark.parametrize(
-    "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"],
+    "pipe", ["tagger", "parser", "ner", "textcat", "morphologizer"]
 )
 def test_pipe_label_data_exports_labels(pipe):
     nlp = Language()
@@ -24,9 +24,9 @@ def test_empty_doc():
     tok2vec = build_Tok2Vec_model(
         MultiHashEmbed(
             width=width,
-            rows=embed_size,
-            also_use_static_vectors=False,
-            also_embed_subwords=True,
+            rows=[embed_size, embed_size, embed_size, embed_size],
+            include_static_vectors=False,
+            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
         ),
         MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )
@@ -44,9 +44,9 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     tok2vec = build_Tok2Vec_model(
         MultiHashEmbed(
             width=width,
-            rows=embed_size,
-            also_use_static_vectors=False,
-            also_embed_subwords=True,
+            rows=[embed_size] * 4,
+            include_static_vectors=False,
+            attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
         ),
         MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )
@@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
 @pytest.mark.parametrize(
     "width,embed_arch,embed_config,encode_arch,encode_config",
     [
-        (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
-        (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
+        (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
+        (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
         (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
         (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],
@@ -118,9 +118,9 @@ cfg_string = """
 [components.tok2vec.model.embed]
 @architectures = "spacy.MultiHashEmbed.v1"
 width = ${components.tok2vec.model.encode.width}
-rows = 2000
-also_embed_subwords = true
-also_use_static_vectors = false
+rows = [2000, 1000, 1000, 1000]
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+include_static_vectors = false
 
 [components.tok2vec.model.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"
@@ -1,6 +1,5 @@
 from spacy.lang.en import English
 from spacy.pipeline import merge_entities
-import pytest
 
 
 def test_issue5918():
@@ -23,7 +22,8 @@ def test_issue5918():
     assert len(doc.ents) == 3
     # make it so that the third span's head is within the entity (ent_iob=I)
     # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents.
-    with pytest.warns(UserWarning):
-        doc[29].head = doc[33]
+    # TODO: test for logging here
+    # with pytest.warns(UserWarning):
+    # doc[29].head = doc[33]
     doc = merge_entities(doc)
     assert len(doc.ents) == 3
@@ -89,9 +89,9 @@ def my_parser():
     tok2vec = build_Tok2Vec_model(
         MultiHashEmbed(
             width=321,
-            rows=5432,
-            also_embed_subwords=True,
-            also_use_static_vectors=False,
+            attrs=["LOWER", "SHAPE"],
+            rows=[5432, 5432],
+            include_static_vectors=False,
         ),
         MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
     )
@ -7,6 +7,15 @@ from spacy import util
|
||||||
from spacy import prefer_gpu, require_gpu
|
from spacy import prefer_gpu, require_gpu
|
||||||
from spacy.ml._precomputable_affine import PrecomputableAffine
|
from spacy.ml._precomputable_affine import PrecomputableAffine
|
||||||
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
|
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
|
||||||
|
from spacy.util import dot_to_object, SimpleFrozenList
|
||||||
|
from thinc.api import Config, Optimizer, ConfigValidationError
|
||||||
|
from spacy.training.batchers import minibatch_by_words
|
||||||
|
from spacy.lang.en import English
|
||||||
|
from spacy.lang.nl import Dutch
|
||||||
|
from spacy.language import DEFAULT_CONFIG_PATH
|
||||||
|
from spacy.schemas import ConfigSchemaTraining
|
||||||
|
|
||||||
|
from .util import get_random_doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -140,6 +149,21 @@ def test_is_unconstrained_version(constraint, expected):
|
||||||
assert util.is_unconstrained_version(constraint) is expected
|
assert util.is_unconstrained_version(constraint) is expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"a1,a2,b1,b2,is_match",
|
||||||
|
[
|
||||||
|
("3.0.0", "3.0", "3.0.1", "3.0", True),
|
||||||
|
("3.1.0", "3.1", "3.2.1", "3.2", False),
|
||||||
|
("xxx", None, "1.2.3.dev0", "1.2", False),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_minor_version(a1, a2, b1, b2, is_match):
|
||||||
|
assert util.get_minor_version(a1) == a2
|
||||||
|
assert util.get_minor_version(b1) == b2
|
||||||
|
assert util.is_minor_version_match(a1, b1) is is_match
|
||||||
|
assert util.is_minor_version_match(a2, b2) is is_match
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"dot_notation,expected",
|
"dot_notation,expected",
|
||||||
[
|
[
|
||||||
|
@ -157,3 +181,128 @@ def test_dot_to_dict(dot_notation, expected):
|
||||||
result = util.dot_to_dict(dot_notation)
|
result = util.dot_to_dict(dot_notation)
|
||||||
assert result == expected
|
assert result == expected
|
||||||
    assert util.dict_to_dot(result) == dot_notation


@pytest.mark.parametrize(
    "doc_sizes, expected_batches",
    [
        ([400, 400, 199], [3]),
        ([400, 400, 199, 3], [4]),
        ([400, 400, 199, 3, 200], [3, 2]),
        ([400, 400, 199, 3, 1], [5]),
        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
        ([400, 400, 199, 3, 1, 200], [3, 3]),
        ([400, 400, 199, 3, 1, 999], [3, 3]),
        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
        ([1, 2, 999], [3]),
        ([1, 2, 999, 1], [4]),
        ([1, 200, 999, 1], [2, 2]),
        ([1, 999, 200, 1], [2, 2]),
    ],
)
def test_util_minibatch(doc_sizes, expected_batches):
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
    tol = 0.2
    batch_size = 1000
    batches = list(
        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=True)
    )
    assert [len(batch) for batch in batches] == expected_batches

    max_size = batch_size + batch_size * tol
    for batch in batches:
        assert sum([len(doc) for doc in batch]) < max_size


@pytest.mark.parametrize(
    "doc_sizes, expected_batches",
    [
        ([400, 4000, 199], [1, 2]),
        ([400, 400, 199, 3000, 200], [1, 4]),
        ([400, 400, 199, 3, 1, 1500], [1, 5]),
        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
        ([1, 2, 9999], [1, 2]),
        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
    ],
)
def test_util_minibatch_oversize(doc_sizes, expected_batches):
    """Test that oversized documents are returned in their own batch."""
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
    tol = 0.2
    batch_size = 1000
    batches = list(
        minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False)
    )
    assert [len(batch) for batch in batches] == expected_batches


def test_util_dot_section():
    cfg_string = """
    [nlp]
    lang = "en"
    pipeline = ["textcat"]

    [components]

    [components.textcat]
    factory = "textcat"

    [components.textcat.model]
    @architectures = "spacy.TextCatBOW.v1"
    exclusive_classes = true
    ngram_size = 1
    no_output_layer = false
    """
    nlp_config = Config().from_str(cfg_string)
    en_nlp = util.load_model_from_config(nlp_config, auto_fill=True)
    default_config = Config().from_disk(DEFAULT_CONFIG_PATH)
    default_config["nlp"]["lang"] = "nl"
    nl_nlp = util.load_model_from_config(default_config, auto_fill=True)
    # Test that creation went OK
    assert isinstance(en_nlp, English)
    assert isinstance(nl_nlp, Dutch)
    assert nl_nlp.pipe_names == []
    assert en_nlp.pipe_names == ["textcat"]
    # not exclusive_classes
    assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
    # Test that default values got overwritten
    assert en_nlp.config["nlp"]["pipeline"] == ["textcat"]
    assert nl_nlp.config["nlp"]["pipeline"] == []  # default value []
    # Test proper functioning of 'dot_to_object'
    with pytest.raises(KeyError):
        dot_to_object(en_nlp.config, "nlp.pipeline.tagger")
    with pytest.raises(KeyError):
        dot_to_object(en_nlp.config, "nlp.unknownattribute")
    T = util.registry.resolve(nl_nlp.config["training"], schema=ConfigSchemaTraining)
    assert isinstance(dot_to_object({"training": T}, "training.optimizer"), Optimizer)


def test_simple_frozen_list():
    t = SimpleFrozenList(["foo", "bar"])
    assert t == ["foo", "bar"]
    assert t.index("bar") == 1  # okay method
    with pytest.raises(NotImplementedError):
        t.append("baz")
    with pytest.raises(NotImplementedError):
        t.sort()
    with pytest.raises(NotImplementedError):
        t.extend(["baz"])
    with pytest.raises(NotImplementedError):
        t.pop()
    t = SimpleFrozenList(["foo", "bar"], error="Error!")
    with pytest.raises(NotImplementedError):
        t.append("baz")


def test_resolve_dot_names():
    config = {
        "training": {"optimizer": {"@optimizers": "Adam.v1"}},
        "foo": {"bar": "training.optimizer", "baz": "training.xyz"},
    }
    result = util.resolve_dot_names(config, ["training.optimizer"])
    assert isinstance(result[0], Optimizer)
    with pytest.raises(ConfigValidationError) as e:
        util.resolve_dot_names(config, ["training.xyz", "training.optimizer"])
    errors = e.value.errors
    assert len(errors) == 1
    assert errors[0]["loc"] == ["training", "xyz"]
@@ -61,7 +61,10 @@ def get_tok2vec_kwargs():
    # This actually creates models, so seems best to put it in a function.
    return {
        "embed": MultiHashEmbed(
-           width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
+           width=32,
+           rows=[500, 500, 500],
+           attrs=["NORM", "PREFIX", "SHAPE"],
+           include_static_vectors=False,
        ),
        "encode": MaxoutWindowEncoder(
            width=32, depth=2, maxout_pieces=2, window_size=1

@@ -73,6 +76,32 @@ def test_tok2vec():
    return build_Tok2Vec_model(**get_tok2vec_kwargs())


def test_multi_hash_embed():
    embed = MultiHashEmbed(
        width=32,
        rows=[500, 500, 500],
        attrs=["NORM", "PREFIX", "SHAPE"],
        include_static_vectors=False,
    )
    hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
    assert len(hash_embeds) == 3
    # Check they look at different columns.
    assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2]
    # Check they use different seeds
    assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3
    # Check they all have the same number of rows
    assert [he.get_dim("nV") for he in hash_embeds] == [500, 500, 500]
    # Now try with different row factors
    embed = MultiHashEmbed(
        width=32,
        rows=[1000, 50, 250],
        attrs=["NORM", "PREFIX", "SHAPE"],
        include_static_vectors=False,
    )
    hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
    assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250]


@pytest.mark.parametrize(
    "seed,model_func,kwargs",
    [

@@ -1,137 +0,0 @@
-import pytest
-
-from spacy import util
-from spacy.util import dot_to_object, SimpleFrozenList
-from thinc.api import Config, Optimizer, ConfigValidationError
-from spacy.training.batchers import minibatch_by_words
-from spacy.lang.en import English
-from spacy.lang.nl import Dutch
-from spacy.language import DEFAULT_CONFIG_PATH
-from spacy.schemas import ConfigSchemaTraining
-
-from .util import get_random_doc
@@ -1528,7 +1528,7 @@ cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
    while not heads_within_sents:
        heads_within_sents = _set_lr_kids_and_edges(tokens, start, end, loop_count)
        if loop_count > 10:
-           warnings.warn(Warnings.W026)
+           util.logger.debug(Warnings.W026)
            break
        loop_count += 1
    # Set sentence starts

@@ -5,7 +5,7 @@ import copy
from functools import partial
from pydantic import BaseModel, StrictStr

-from ..util import registry, logger
+from ..util import registry
from ..tokens import Doc
from .example import Example

@@ -64,7 +64,7 @@ def dont_augment(nlp: "Language", example: Example) -> Iterator[Example]:


def lower_casing_augmenter(
-    nlp: "Language", example: Example, *, level: float,
+    nlp: "Language", example: Example, *, level: float
) -> Iterator[Example]:
    if random.random() >= level:
        yield example

@@ -119,9 +119,8 @@ def make_orth_variants(
    orig_token_dict = copy.deepcopy(token_dict)
    ndsv = orth_variants.get("single", [])
    ndpv = orth_variants.get("paired", [])
-   logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants")
-   words = token_dict.get("words", [])
-   tags = token_dict.get("tags", [])
+   words = token_dict.get("ORTH", [])
+   tags = token_dict.get("TAG", [])
    # keep unmodified if words or tags are not defined
    if words and tags:
        if lower:

@@ -154,8 +153,8 @@ def make_orth_variants(
                if words[word_idx] in pair:
                    pair_idx = pair.index(words[word_idx])
                    words[word_idx] = punct_choices[punct_idx][pair_idx]
-   token_dict["words"] = words
-   token_dict["tags"] = tags
+   token_dict["ORTH"] = words
+   token_dict["TAG"] = tags
    # modify raw
    if raw is not None:
        variants = []

@@ -103,7 +103,7 @@ def conll_ner_to_docs(
        lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
        cols = list(zip(*[line.split() for line in lines]))
        if len(cols) < 2:
-           raise ValueError(Errors.E093)
+           raise ValueError(Errors.E903)
        length = len(cols[0])
        words.extend(cols[0])
        sent_starts.extend([True] + [False] * (length - 1))

@@ -46,7 +46,7 @@ def read_iob(raw_sents, vocab, n_sents):
            sent_words, sent_iob = zip(*sent_tokens)
            sent_tags = ["-"] * len(sent_words)
        else:
-           raise ValueError(Errors.E092)
+           raise ValueError(Errors.E902)
        words.extend(sent_words)
        tags.extend(sent_tags)
        iob.extend(sent_iob)

@@ -3,19 +3,24 @@ from typing import Optional, TYPE_CHECKING
from pathlib import Path
from timeit import default_timer as timer
from thinc.api import Optimizer, Config, constant, fix_random_seed, set_gpu_allocator
from wasabi import Printer
import random
-import wasabi
import sys
import shutil

from .example import Example
from ..schemas import ConfigSchemaTraining
from ..errors import Errors
-from ..util import resolve_dot_names, registry
+from ..util import resolve_dot_names, registry, logger

if TYPE_CHECKING:
    from ..language import Language  # noqa: F401


DIR_MODEL_BEST = "model-best"
DIR_MODEL_LAST = "model-last"


def train(
    nlp: "Language",
    output_path: Optional[Path] = None,

@@ -38,7 +43,7 @@ def train(
    RETURNS (Path / None): The path to the final exported model.
    """
    # We use no_print here so we can respect the stdout/stderr options.
-   msg = wasabi.Printer(no_print=True)
+   msg = Printer(no_print=True)
    # Create iterator, which yields out info after each optimization step.
    config = nlp.config.interpolate()
    if config["training"]["seed"] is not None:

@@ -69,6 +74,7 @@ def train(
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
    )
    clean_output_dir(output_path)
    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
    if frozen_components:
        stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")

@@ -83,7 +89,7 @@ def train(
                update_meta(T, nlp, info)
                with nlp.use_params(optimizer.averages):
                    nlp = before_to_disk(nlp)
-                   nlp.to_disk(output_path / "model-best")
+                   nlp.to_disk(output_path / DIR_MODEL_BEST)
    except Exception as e:
        if output_path is not None:
            # We don't want to swallow the traceback if we don't have a

@@ -100,7 +106,7 @@ def train(
    finally:
        finalize_logger()
    if output_path is not None:
-       final_model_path = output_path / "model-last"
+       final_model_path = output_path / DIR_MODEL_LAST
        if optimizer.averages:
            with nlp.use_params(optimizer.averages):
                nlp.to_disk(final_model_path)

@@ -305,3 +311,19 @@ def create_before_to_disk_callback(
        return modified_nlp

    return before_to_disk


def clean_output_dir(path: Union[str, Path]) -> None:
    """Remove an existing output directory. Typically used to ensure that a
    directory like model-best and its contents aren't just being overwritten
    by nlp.to_disk, which could preserve existing subdirectories (e.g.
    components that don't exist anymore).
    """
    if path is not None and path.exists():
        for subdir in [path / DIR_MODEL_BEST, path / DIR_MODEL_LAST]:
            if subdir.exists():
                try:
                    shutil.rmtree(str(subdir))
                    logger.debug(f"Removed existing output directory: {subdir}")
                except Exception as e:
                    raise IOError(Errors.E901.format(path=path)) from e
@@ -73,6 +73,7 @@ logger = logging.getLogger("spacy")

class ENV_VARS:
    CONFIG_OVERRIDES = "SPACY_CONFIG_OVERRIDES"
    PROJECT_USE_GIT_VERSION = "SPACY_PROJECT_USE_GIT_VERSION"


class registry(thinc.registry):

@@ -584,6 +585,33 @@ def get_base_version(version: str) -> str:
    return Version(version).base_version


def get_minor_version(version: str) -> Optional[str]:
    """Get the major + minor version (without patch or prerelease identifiers).

    version (str): The version.
    RETURNS (str): The major + minor version or None if version is invalid.
    """
    try:
        v = Version(version)
    except (TypeError, InvalidVersion):
        return None
    return f"{v.major}.{v.minor}"


def is_minor_version_match(version_a: str, version_b: str) -> bool:
    """Compare two versions and check if they match in major and minor, without
    patch or prerelease identifiers. Used internally for compatibility checks
    that should be insensitive to patch releases.

    version_a (str): The first version
    version_b (str): The second version.
    RETURNS (bool): Whether the versions match.
    """
    a = get_minor_version(version_a)
    b = get_minor_version(version_b)
    return a is not None and b is not None and a == b


def load_meta(path: Union[str, Path]) -> Dict[str, Any]:
    """Load a model meta.json from a path and validate its contents.

@@ -1315,3 +1343,16 @@ def is_cython_func(func: Callable) -> bool:
        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
        return hasattr(cls_func, attr)
    return False


def check_bool_env_var(env_var: str) -> bool:
    """Convert the value of an environment variable to a boolean. Add special
    check for "0" (falsy) and consider everything else truthy, except unset.

    env_var (str): The name of the environment variable to check.
    RETURNS (bool): Its boolean value.
    """
    value = os.environ.get(env_var, False)
    if value == "0":
        return False
    return bool(value)
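These helpers are easy to sanity-check in isolation. A small usage sketch (the version strings and environment variable values below are illustrative only):

```python
import os
from spacy import util

# Major + minor only; patch and prerelease identifiers are dropped.
assert util.get_minor_version("3.0.0a34") == "3.0"
assert util.get_minor_version("not a version") is None

# Patch releases match, different minor versions don't.
assert util.is_minor_version_match("3.0.1", "3.0.5")
assert not util.is_minor_version_match("3.0.1", "3.1.0")

# Only "0" (or an unset variable) counts as falsy.
os.environ["SPACY_PROJECT_USE_GIT_VERSION"] = "0"
assert util.check_bool_env_var("SPACY_PROJECT_USE_GIT_VERSION") is False
os.environ["SPACY_PROJECT_USE_GIT_VERSION"] = "true"
assert util.check_bool_env_var("SPACY_PROJECT_USE_GIT_VERSION") is True
```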
@@ -445,9 +445,9 @@ cdef class Vocab:
        setters = ["strings", "vectors"]
        if "strings" not in exclude:
            self.strings.to_disk(path / "strings.json")
-       if "vectors" not in "exclude" and self.vectors is not None:
+       if "vectors" not in exclude:
            self.vectors.to_disk(path)
-       if "lookups" not in "exclude" and self.lookups is not None:
+       if "lookups" not in exclude:
            self.lookups.to_disk(path)

    def from_disk(self, path, *, exclude=tuple()):
@@ -136,25 +136,28 @@ argument that connects to the shared `tok2vec` component in the pipeline.
> [model]
> @architectures = "spacy.MultiHashEmbed.v1"
> width = 64
-> rows = 2000
-> also_embed_subwords = false
-> also_use_static_vectors = false
+> attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+> rows = [2000, 1000, 1000, 1000]
+> include_static_vectors = true
> ```

Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build mixed representations. The features used can
be configured with the `attrs` argument. The suggested attributes are `NORM`,
`PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some
subword information, without constructing a fully character-based
representation. If pretrained vectors are available, they can be included in the
representation as well; the vectors table will be kept static (i.e. it's not
updated).

| Name                     | Description |
| ------------------------ | ----------- |
| `width`                  | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |
| `attrs`                  | The token attributes to embed. A separate embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ |
| `rows`                   | The number of rows for each embedding table. The layer needs surprisingly few rows, due to its use of the hashing trick: generally between `2000` and `10000` rows is sufficient, even for very large vocabularies. A number of rows must be specified for each table, so the `rows` list must be of the same length as the `attrs` parameter. ~~List[int]~~ |
| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~ |
| **CREATES**              | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
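The same layer can also be constructed directly in Python, which can be handy for quick experiments. This is only a sketch and assumes the layer is importable from `spacy.ml.models`, as in spaCy's own model tests; in a pipeline you would normally configure it through the config block shown above:

```python
from spacy.ml.models import MultiHashEmbed

# One embedding table per attribute: `rows` and `attrs` must have the same length.
embed = MultiHashEmbed(
    width=64,
    attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
    rows=[2000, 1000, 1000, 1000],
    include_static_vectors=False,  # no vectors table required for this example
)
# The result is a Model[List[Doc], List[Floats2d]]: one embedding matrix per Doc.
```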

### spacy.CharacterEmbed.v1 {#CharacterEmbed}

@@ -226,6 +226,12 @@ the "catastrophic forgetting" problem. This feature is experimental.
Find the loss and gradient of loss for the batch of documents and their
predicted scores.

<Infobox variant="danger">

This method needs to be overwritten with your own custom `get_loss` method.

</Infobox>

> #### Example
>
> ```python
@@ -86,7 +86,8 @@ see are:
| ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. |
| ~~Padded~~ | A container to handle variable-length sequence data in a padded contiguous array. |

-The model type signatures help you figure out which model architectures and
+See the [Thinc type reference](https://thinc.ai/docs/api-types) for details. The
+model type signatures help you figure out which model architectures and
components can **fit together**. For instance, the
[`TextCategorizer`](/api/textcategorizer) class expects a model typed
~~Model[List[Doc], Floats2d]~~, because the model will predict one row of

@@ -288,7 +289,7 @@ those parts of the network.

To use our custom model including the PyTorch subnetwork, all we need to do is
register the architecture using the
-[`architectures` registry](/api/top-level#registry). This will assign the
+[`architectures` registry](/api/top-level#registry). This assigns the
architecture a name so spaCy knows how to find it, and allows passing in
arguments like hyperparameters via the [config](/usage/training#config). The
full example then becomes:

@@ -373,7 +374,7 @@ gpu_allocator = "pytorch"
Of course it's also possible to define the `Model` from the previous section
entirely in Thinc. The Thinc documentation provides details on the
[various layers](https://thinc.ai/docs/api-layers) and helper functions
-available. Combinators can also be used to
+available. Combinators can be used to
[overload operators](https://thinc.ai/docs/usage-models#operators) and a common
usage pattern is to bind `chain` to `>>`. The "native" Thinc version of our
simple neural network would then become:
@@ -486,28 +487,376 @@ with Model.define_operators({">>": chain}):

## Create new trainable components {#components}

In addition to [swapping out](#swap-architectures) default models in built-in
components, you can also implement an entirely new,
[trainable](/usage/processing-pipelines#trainable-components) pipeline component
from scratch. This can be done by creating a new class inheriting from
[`Pipe`](/api/pipe), and linking it up to your custom model implementation.

<Infobox title="Trainable component API" emoji="💡">

For details on how to implement pipeline components, check out the usage guide
on [custom components](/usage/processing-pipelines#custom-component) and the
overview of the `Pipe` methods used by
[trainable components](/usage/processing-pipelines#trainable-components).

</Infobox>

### Example: Entity relation extraction component {#component-rel}

This section outlines an example use-case of implementing a **novel relation
extraction component** from scratch. We'll implement a binary relation
extraction method that determines whether or not **two entities** in a document
are related, and if so, what type of relation. We'll allow multiple types of
relations between two such entities (multi-label setting). There are two major
steps required:

1. Implement a [machine learning model](#component-rel-model) specific to this
   task. It will have to extract candidates from a [`Doc`](/api/doc) and predict
   a relation for the available candidate pairs.
2. Implement a custom [pipeline component](#component-rel-pipe) powered by the
   machine learning model that sets annotations on the [`Doc`](/api/doc) passing
   through the pipeline.

<!-- TODO: <Project id="tutorials/ner-relations">

</Project> -->

#### Step 1: Implementing the Model {#component-rel-model}

We need to implement a [`Model`](https://thinc.ai/docs/api-model) that takes a
**list of documents** (~~List[Doc]~~) as input, and outputs a **two-dimensional
matrix** (~~Floats2d~~) of predictions:

> #### Model type annotations
>
> The `Model` class is a generic type that can specify its input and output
> types, e.g. ~~Model[List[Doc], Floats2d]~~. Type hints are used for static
> type checks and validation. See the section on [type signatures](#type-sigs)
> for details.

```python
### Register the model architecture
@registry.architectures.register("rel_model.v1")
def create_relation_model(...) -> Model[List[Doc], Floats2d]:
    model = ...  # 👈 model will go here
    return model
```

The first layer in this model will typically be an
[embedding layer](/usage/embeddings-transformers) such as a
[`Tok2Vec`](/api/tok2vec) component or a [`Transformer`](/api/transformer). This
layer is assumed to be of type ~~Model[List[Doc], List[Floats2d]]~~ as it
transforms each **document into a list of tokens**, with each token being
represented by its embedding in the vector space.
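For illustration only, here is a minimal sketch of how such an embedding layer could be plugged into a model with the required ~~Model[List[Doc], Floats2d]~~ signature, using generic Thinc combinators. It is not the relation model itself — it simply pools each document and ignores candidate pairs entirely — but it shows how the pieces compose:

```python
from typing import List

from thinc.api import Model, chain, list2ragged, reduce_mean, Linear
from thinc.types import Floats2d
from spacy.tokens import Doc


def build_toy_model(
    tok2vec: Model[List[Doc], List[Floats2d]], n_labels: int
) -> Model[List[Doc], Floats2d]:
    # Embed the tokens, pool each document to a single vector and map it to
    # one score per label. A real relation model would instead build one row
    # per candidate entity pair.
    return chain(tok2vec, list2ragged(), reduce_mean(), Linear(nO=n_labels))
```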
Next, we need a method that **generates pairs of entities** that we want to
classify as being related or not. As these candidate pairs are typically formed
within one document, this function takes a [`Doc`](/api/doc) as input and
outputs a `List` of `Span` tuples. For instance, a very straightforward
implementation would be to just take any two entities from the same document:

```python
### Simple candidate generation
def get_candidates(doc: Doc) -> List[Tuple[Span, Span]]:
    candidates = []
    for ent1 in doc.ents:
        for ent2 in doc.ents:
            candidates.append((ent1, ent2))
    return candidates
```

But we could also refine this further by **excluding relations** of an entity
with itself, and posing a **maximum distance** (in number of tokens) between two
entities. We register this function in the
[`@misc` registry](/api/top-level#registry) so we can refer to it from the
config, and easily swap it out for any other candidate generation function.

> #### config.cfg (excerpt)
>
> ```ini
> [model]
> @architectures = "rel_model.v1"
>
> [model.tok2vec]
> # ...
>
> [model.get_candidates]
> @misc = "rel_cand_generator.v1"
> max_length = 20
> ```

```python
### Extended candidate generation {highlight="1,2,7,8"}
@registry.misc.register("rel_cand_generator.v1")
def create_candidate_indices(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
    def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
        candidates = []
        for ent1 in doc.ents:
            for ent2 in doc.ents:
                if ent1 != ent2:
                    if max_length and abs(ent2.start - ent1.start) <= max_length:
                        candidates.append((ent1, ent2))
        return candidates
    return get_candidates
```

Finally, we require a method that transforms the candidate entity pairs into a
2D tensor using the specified [`Tok2Vec`](/api/tok2vec) or
[`Transformer`](/api/transformer). The resulting ~~Floats2d~~ object will then be
processed by a final `output_layer` of the network. Putting all this together,
we can define our relation model in a config file as such:

```ini
### config.cfg
[model]
@architectures = "rel_model.v1"
# ...

[model.tok2vec]
# ...

[model.get_candidates]
@misc = "rel_cand_generator.v2"
max_length = 20

[model.create_candidate_tensor]
@misc = "rel_cand_tensor.v1"

[model.output_layer]
@architectures = "rel_output_layer.v1"
# ...
```

<!-- TODO: link to project for implementation details -->
<!-- TODO: maybe embed files from project that show the architectures? -->

When creating this model, we store the custom functions as
[attributes](https://thinc.ai/docs/api-model#properties) and the sublayers as
references, so we can access them easily:

```python
tok2vec_layer = model.get_ref("tok2vec")
output_layer = model.get_ref("output_layer")
create_candidate_tensor = model.attrs["create_candidate_tensor"]
get_candidates = model.attrs["get_candidates"]
```

#### Step 2: Implementing the pipeline component {#component-rel-pipe}

To use our new relation extraction model as part of a custom
[trainable component](/usage/processing-pipelines#trainable-components), we
create a subclass of [`Pipe`](/api/pipe) that holds the model:

```python
### Pipeline component skeleton
from spacy.pipeline import Pipe

class RelationExtractor(Pipe):
    def __init__(self, vocab, model, name="rel"):
        """Create a component instance."""
        self.model = model
        self.vocab = vocab
        self.name = name

    def update(self, examples, drop=0.0, set_annotations=False, sgd=None, losses=None):
        """Learn from a batch of Example objects."""
        ...

    def predict(self, docs):
        """Apply the model to a batch of Doc objects."""
        ...

    def set_annotations(self, docs, predictions):
        """Modify a batch of Doc objects using the predictions."""
        ...

    def initialize(self, get_examples, nlp=None, labels=None):
        """Initialize the model before training."""
        ...

    def add_label(self, label):
        """Add a label to the component."""
        ...
```

Before the model can be used, it needs to be
[initialized](/usage/training#initialization). This function receives a callback
to access the full **training data set**, or a representative sample. This data
set can be used to deduce all **relevant labels**. Alternatively, a list of
labels can be provided to `initialize`, or you can call
`RelationExtractor.add_label` directly. The number of labels defines the output
dimensionality of the network, and will be used to do
[shape inference](https://thinc.ai/docs/usage-models#validation) throughout the
layers of the neural network. This is triggered by calling
[`Model.initialize`](https://thinc.ai/api/model#initialize).

```python
### The initialize method {highlight="12,18,22"}
from itertools import islice

def initialize(
    self,
    get_examples: Callable[[], Iterable[Example]],
    *,
    nlp: Language = None,
    labels: Optional[List[str]] = None,
):
    if labels is not None:
        for label in labels:
            self.add_label(label)
    else:
        for example in get_examples():
            relations = example.reference._.rel
            for indices, label_dict in relations.items():
                for label in label_dict.keys():
                    self.add_label(label)
    subbatch = list(islice(get_examples(), 10))
    doc_sample = [eg.reference for eg in subbatch]
    label_sample = self._examples_to_truth(subbatch)
    self.model.initialize(X=doc_sample, Y=label_sample)
```

The `initialize` method is triggered whenever this component is part of an `nlp`
pipeline, and [`nlp.initialize`](/api/language#initialize) is invoked.
Typically, this happens when the pipeline is set up before training in
[`spacy train`](/api/cli#training). After initialization, the pipeline component
and its internal model can be trained and used to make predictions.

During training, the function [`update`](/api/pipe#update) is invoked, which
delegates to
[`Model.begin_update`](https://thinc.ai/docs/api-model#begin_update) and a
[`get_loss`](/api/pipe#get_loss) function that **calculates the loss** for a
batch of examples, as well as the **gradient** of loss that will be used to
update the weights of the model layers. Thinc provides several
[loss functions](https://thinc.ai/docs/api-loss) that can be used for the
implementation of the `get_loss` function.

```python
### The update method {highlight="12-14"}
def update(
    self,
    examples: Iterable[Example],
    *,
    drop: float = 0.0,
    set_annotations: bool = False,
    sgd: Optional[Optimizer] = None,
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    ...
    docs = [ex.predicted for ex in examples]
    predictions, backprop = self.model.begin_update(docs)
    loss, gradient = self.get_loss(examples, predictions)
    backprop(gradient)
    losses[self.name] += loss
    ...
    return losses
```
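The `get_loss` method itself is left to the reader. A minimal sketch — assuming a mean-squared-error objective over the multi-label scores and the `self._examples_to_truth` helper already referenced in `initialize` above — could look like this:

```python
### A possible get_loss implementation (sketch)
from typing import Tuple
from thinc.types import Floats2d

def get_loss(self, examples, scores: Floats2d) -> Tuple[float, Floats2d]:
    # Build the gold-standard label matrix with the same shape as the scores.
    truths = self.model.ops.asarray(self._examples_to_truth(examples))
    # For a mean squared error objective, the gradient is the difference
    # between the predicted scores and the gold labels.
    gradient = scores - truths
    mean_square_error = (gradient ** 2).sum(axis=1).mean()
    return float(mean_square_error), gradient
```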
When the internal model is trained, the component can be used to make novel
**predictions**. The [`predict`](/api/pipe#predict) function needs to be
implemented for each subclass of `Pipe`. In our case, we can simply delegate to
the internal model's [predict](https://thinc.ai/docs/api-model#predict) function
that takes a batch of `Doc` objects and returns a ~~Floats2d~~ array:

```python
### The predict method
def predict(self, docs: Iterable[Doc]) -> Floats2d:
    predictions = self.model.predict(docs)
    return self.model.ops.asarray(predictions)
```

The final method that needs to be implemented is
[`set_annotations`](/api/pipe#set_annotations). This function takes the
predictions, and modifies the given `Doc` object in place to store them. For our
relation extraction component, we store the data as a dictionary in a custom
[extension attribute](/usage/processing-pipelines#custom-components-attributes)
`doc._.rel`. As keys, we represent the candidate pair by the **start offsets of
each entity**, as this defines an entity pair uniquely within one document.

To interpret the scores predicted by the relation extraction model correctly, we
need to refer to the model's `get_candidates` function that defined which pairs
of entities were relevant candidates, so that the predictions can be linked to
those exact entities:

> #### Example output
>
> ```python
> doc = nlp("Amsterdam is the capital of the Netherlands.")
> print("spans", [(e.start, e.text, e.label_) for e in doc.ents])
> for value, rel_dict in doc._.rel.items():
>     print(f"{value}: {rel_dict}")
>
> # spans [(0, 'Amsterdam', 'LOC'), (6, 'Netherlands', 'LOC')]
> # (0, 6): {'CAPITAL_OF': 0.89, 'LOCATED_IN': 0.75, 'UNRELATED': 0.002}
> # (6, 0): {'CAPITAL_OF': 0.01, 'LOCATED_IN': 0.13, 'UNRELATED': 0.017}
> ```

```python
### Registering the extension attribute
from spacy.tokens import Doc
Doc.set_extension("rel", default={})
```

```python
### The set_annotations method {highlight="5-6,10"}
def set_annotations(self, docs: Iterable[Doc], predictions: Floats2d):
    c = 0
    get_candidates = self.model.attrs["get_candidates"]
    for doc in docs:
        for (e1, e2) in get_candidates(doc):
            offset = (e1.start, e2.start)
            if offset not in doc._.rel:
                doc._.rel[offset] = {}
            for j, label in enumerate(self.labels):
                doc._.rel[offset][label] = predictions[c, j]
            c += 1
```

Under the hood, when the pipe is applied to a document, it delegates to the
`predict` and `set_annotations` methods:

```python
### The __call__ method
def __call__(self, doc: Doc):
    predictions = self.predict([doc])
    self.set_annotations([doc], predictions)
    return doc
```

Once our `Pipe` subclass is fully implemented, we can
[register](/usage/processing-pipelines#custom-components-factories) the
component with the [`@Language.factory`](/api/language#factory) decorator. This
assigns it a name and lets you create the component with
[`nlp.add_pipe`](/api/language#add_pipe) and via the
[config](/usage/training#config).

> #### config.cfg (excerpt)
>
> ```ini
> [components.relation_extractor]
> factory = "relation_extractor"
>
> [components.relation_extractor.model]
> @architectures = "rel_model.v1"
>
> [components.relation_extractor.model.tok2vec]
> # ...
>
> [components.relation_extractor.model.get_candidates]
> @misc = "rel_cand_generator.v1"
> max_length = 20
> ```

```python
### Registering the pipeline component
from spacy.language import Language

@Language.factory("relation_extractor")
def make_relation_extractor(nlp, name, model):
    return RelationExtractor(nlp.vocab, model, name)
```

<!-- TODO: <Project id="tutorials/ner-relations">

</Project> -->
@@ -1176,7 +1176,7 @@ plug fully custom machine learning components into your pipeline. You'll need
the following:

1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
-  can be a model using implemented in
+  can be a model implemented in
   [Thinc](/usage/layers-architectures#thinc), or a
   [wrapped model](/usage/layers-architectures#frameworks) implemented in
   PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a

@@ -216,15 +216,16 @@ pipelines.
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
```

| Section         | Description |
| --------------- | ----------- |
| `title`         | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description`   | An optional project description used in [auto-generated docs](#custom-docs). |
| `vars`          | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `directories`   | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets`        | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
| `workflows`     | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. |
| `commands`      | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. |
| `spacy_version` | Optional spaCy version range like `>=3.0.0,<3.1.0` that the project is compatible with. If it's loaded with an incompatible version, an error is raised when the project is loaded. |

### Data assets {#data-assets}
@@ -38,7 +38,7 @@
  cursor: pointer
  display: inline-block
  padding: 0.35rem 0.5rem 0.25rem 0
- margin: 0 1rem 0.75rem 0
+ margin: 0 1rem 0.5rem 0
  font-size: var(--font-size-xs)
  font-weight: bold

@@ -73,16 +73,19 @@
    background: var(--color-theme)

  .checkbox + &:before
    $size: 18px
    content: ""
    display: inline-block
-   width: 20px
+   width: $size
-   height: 20px
+   height: $size
    border: 1px solid var(--color-subtle)
    vertical-align: middle
    margin-right: 0.5rem
    cursor: pointer
-   border-radius: var(--border-radius)
+   border-radius: $size / 4
    background: var(--color-back)
    position: relative
    top: -1px

  .checkbox:checked + &:before
    // Embed "check" icon here for simplicity
|
@ -4,6 +4,8 @@ import { StaticQuery, graphql } from 'gatsby'
|
||||||
import { Quickstart, QS } from '../components/quickstart'
|
import { Quickstart, QS } from '../components/quickstart'
|
||||||
import { repo } from '../components/util'
|
import { repo } from '../components/util'
|
||||||
|
|
||||||
|
const DEFAULT_MODELS = ['en']
|
||||||
|
const DEFAULT_OPT = 'efficiency'
|
||||||
const DEFAULT_HARDWARE = 'cpu'
|
const DEFAULT_HARDWARE = 'cpu'
|
||||||
const DEFAULT_CUDA = 'cuda100'
|
const DEFAULT_CUDA = 'cuda100'
|
||||||
const CUDA = {
|
const CUDA = {
|
||||||
|
@@ -15,6 +17,7 @@ const CUDA = {
     '10.1': 'cuda101',
     '10.2': 'cuda102',
 }
+const LANG_EXTRAS = ['zh', 'ja'] // only for languages with models
 const DATA = [
     {
         id: 'os',

@@ -68,14 +71,24 @@ const QuickstartInstall = ({ id, title }) => {
     const [train, setTrain] = useState(false)
     const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
     const [cuda, setCuda] = useState(DEFAULT_CUDA)
+    const [selectedModels, setModels] = useState(DEFAULT_MODELS)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
     const setters = {
         hardware: v => (Array.isArray(v) ? setHardware(v[0]) : setCuda(v)),
         config: v => setTrain(v.includes('train')),
+        models: setModels,
+        optimize: v => setEfficiency(v.includes('efficiency')),
     }
     const showDropdown = {
         hardware: () => hardware === 'gpu',
     }
-    const pipExtras = [hardware === 'gpu' && cuda, train && 'transformers', train && 'lookups']
+    const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : []
+    const pipExtras = [
+        hardware === 'gpu' && cuda,
+        train && 'transformers',
+        train && 'lookups',
+        ...modelExtras,
+    ]
         .filter(e => e)
         .join(',')
     return (

@@ -89,13 +102,37 @@ const QuickstartInstall = ({ id, title }) => {
         ...DATA,
         {
             id: 'models',
-            title: 'Trained Pipelines',
+            title: 'Trained pipelines',
             multiple: true,
             options: models
                 .sort((a, b) => a.name.localeCompare(b.name))
-                .map(({ code, name }) => ({ id: code, title: name })),
+                .map(({ code, name }) => ({
+                    id: code,
+                    title: name,
+                    checked: DEFAULT_MODELS.includes(code),
+                })),
         },
     ]
+    if (selectedModels.length) {
+        data.push({
+            id: 'optimize',
+            title: 'Select pipeline for',
+            options: [
+                {
+                    id: 'efficiency',
+                    title: 'efficiency',
+                    checked: DEFAULT_OPT === 'efficiency',
+                    help: 'Faster and smaller pipeline, but less accurate',
+                },
+                {
+                    id: 'accuracy',
+                    title: 'accuracy',
+                    checked: DEFAULT_OPT === 'accuracy',
+                    help: 'Larger and slower pipeline, but more accurate',
+                },
+            ],
+        })
+    }
     return (
         <Quickstart
             data={data}

@@ -149,11 +186,14 @@ const QuickstartInstall = ({ id, title }) => {
                 conda install -c conda-forge spacy-lookups-data
             </QS>
 
-            {models.map(({ code, models: modelOptions }) => (
-                <QS models={code} key={code}>
-                    python -m spacy download {modelOptions[0]}
-                </QS>
-            ))}
+            {models.map(({ code, models: modelOptions }) => {
+                const pkg = modelOptions[efficiency ? 0 : modelOptions.length - 1]
+                return (
+                    <QS models={code} key={code}>
+                        python -m spacy download {pkg}
+                    </QS>
+                )
+            })}
         </Quickstart>
     )
 }}

@@ -31,25 +31,33 @@ const data = [
     },
     {
         id: 'optimize',
-        title: 'Optimize for',
-        help:
-            'Optimize for efficiency (faster & smaller model) or higher accuracy (larger & slower model)',
+        title: 'Select for',
         options: [
-            { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' },
-            { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' },
+            {
+                id: 'efficiency',
+                title: 'efficiency',
+                checked: DEFAULT_OPT === 'efficiency',
+                help: 'Faster and smaller pipeline, but less accurate',
+            },
+            {
+                id: 'accuracy',
+                title: 'accuracy',
+                checked: DEFAULT_OPT === 'accuracy',
+                help: 'Larger and slower pipeline, but more accurate',
+            },
         ],
     },
     {
         id: 'config',
         title: 'Options',
         multiple: true,
-        options: [{ id: 'example', title: 'Show usage example' }],
+        options: [{ id: 'example', title: 'Show text example' }],
     },
 ]
 
 const QuickstartInstall = ({ id, title, description, children }) => {
     const [lang, setLang] = useState(DEFAULT_LANG)
-    const [efficiency, setEfficiency] = useState(DEFAULT_OPT)
+    const [efficiency, setEfficiency] = useState(DEFAULT_OPT === 'efficiency')
     const setters = {
         lang: setLang,
         optimize: v => setEfficiency(v.includes('efficiency')),